diff --git a/src/mac2nix/scanners/_utils.py b/src/mac2nix/scanners/_utils.py index d0a162f..88b1d1c 100644 --- a/src/mac2nix/scanners/_utils.py +++ b/src/mac2nix/scanners/_utils.py @@ -8,6 +8,8 @@ import plistlib import shutil import subprocess +from collections.abc import Callable +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from pathlib import Path from typing import Any @@ -59,9 +61,226 @@ # Trash ".Trash", ".Trashes", + # Python packaging (biggest single source: 33K+ .py files on test system) + "site-packages", + ".venv", + "venv", + ".eggs", + ".mypy_cache", + ".ruff_cache", + ".pytest_cache", + ".pytype", + ".direnv", + # Electron/Chromium (every Electron app generates these) + "Crashpad", + "Session Storage", + "WebStorage", + "Local Storage", + "_locales", + # macOS internal metadata + ".Spotlight-V100", + ".fseventsd", + ".DocumentRevisions-V100", + ".TemporaryItems", + # Developer tools & large data stores + "CoreSimulator", + "DeviceSupport", + "steamapps", + "drive_c", } ) +NON_CONFIG_EXTENSIONS = frozenset( + { + # Source code (not user config — package/library files) + ".py", + ".pyi", + ".pyc", + ".pyo", + ".js", + ".jsx", + ".ts", + ".tsx", + ".mjs", + ".cjs", + ".c", + ".cpp", + ".cc", + ".h", + ".hpp", + ".m", + ".mm", + ".swift", + ".go", + ".rs", + ".java", + ".class", + ".jar", + ".rb", + ".pl", + ".pm", + ".lua", + ".r", + # Compiled/binary artifacts + ".so", + ".dylib", + ".dll", + ".o", + ".a", + ".lib", + ".wasm", + ".node", + ".framework", + ".exe", + ".msi", + # Media & images + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".ico", + ".icns", + ".svg", + ".webp", + ".tiff", + ".tif", + ".heic", + ".heif", + ".mp3", + ".mp4", + ".m4a", + ".m4v", + ".wav", + ".aac", + ".flac", + ".ogg", + ".avi", + ".mov", + ".mkv", + ".webm", + ".ttf", + ".otf", + ".woff", + ".woff2", + ".eot", + # Archives & compressed + ".zip", + ".tar", + ".gz", + ".bz2", + ".xz", + ".7z", + ".rar", + ".dmg", + 
".iso", + ".pkg", + # Data files (not human-readable config) + ".lance", + ".parquet", + ".arrow", + ".feather", + ".npy", + ".npz", + ".pickle", + ".pkl", + ".ldb", + ".sst", + # Web assets (Electron app bundles) + ".css", + ".scss", + ".less", + ".html", + ".htm", + # GPU shaders + ".amd", + ".glsl", + ".hlsl", + ".metal", + # Debug & build + ".map", + ".d", + ".dep", + ".log", + # Documents (not config) + ".pdf", + ".doc", + ".docx", + ".xls", + ".xlsx", + ".ppt", + ".pptx", + # Email messages (~/Library/Mail can have 500K+ .emlx files) + ".emlx", + ".eml", + ".mbox", + # Misc non-config + ".strings", + ".nib", + ".storyboard", + ".typed", + ".manifest", + } +) + +WALK_SKIP_SUFFIXES = (".noindex", ".lproj") + + +def parallel_walk_dirs[T]( + dirs: list[Path], + process_fn: Callable[[Path], T], + *, + max_workers: int = 8, +) -> list[T]: + """Walk multiple independent directory trees in parallel. + + Each directory in *dirs* is submitted as an independent work unit to a + ThreadPoolExecutor. The *process_fn* receives a single directory Path and + should return a result of type T. Exceptions in individual workers are + logged and skipped. + + The function is designed to be called from a scanner's ``scan()`` method, + which already runs inside ``asyncio.to_thread()`` via the orchestrator. + The ThreadPoolExecutor provides a second level of parallelism within + the scanner's thread. + + Note: callers run inside asyncio.to_thread() via the orchestrator, creating + nested thread pools. Peak thread count is bounded (~8 per scanner x + concurrent scanners) and well within OS limits. + + Args: + dirs: Independent directory roots to process in parallel. + process_fn: Function that processes one directory and returns a result. + max_workers: Maximum concurrent workers (default 8, suitable for NVMe SSD). + + Returns: + List of results from successful process_fn calls (order not guaranteed). 
+ """ + if not dirs: + return [] + + results: list[T] = [] + + # For very small dir lists, skip the pool overhead + if len(dirs) <= 2: + for d in dirs: + try: + results.append(process_fn(d)) + except Exception: + logger.exception("Failed to process directory: %s", d) + else: + with ThreadPoolExecutor(max_workers=min(max_workers, len(dirs))) as pool: + futures = {pool.submit(process_fn, d): d for d in dirs} + for future in as_completed(futures): + directory = futures[future] + try: + results.append(future.result()) + except Exception: + logger.exception("Failed to process directory: %s", directory) + + return results + + LAUNCHD_DIRS: list[tuple[Path, str]] = [ (Path.home() / "Library" / "LaunchAgents", "user"), (Path("/Library/LaunchAgents"), "system"), diff --git a/src/mac2nix/scanners/library_scanner.py b/src/mac2nix/scanners/library_scanner.py index 9370a90..dab9dc4 100644 --- a/src/mac2nix/scanners/library_scanner.py +++ b/src/mac2nix/scanners/library_scanner.py @@ -5,6 +5,7 @@ import contextlib import logging import os +import re import sqlite3 from datetime import UTC, datetime from pathlib import Path @@ -20,7 +21,15 @@ LibraryResult, WorkflowEntry, ) -from mac2nix.scanners._utils import WALK_SKIP_DIRS, hash_file, read_plist_safe, run_command +from mac2nix.scanners._utils import ( + NON_CONFIG_EXTENSIONS, + WALK_SKIP_DIRS, + WALK_SKIP_SUFFIXES, + hash_file, + parallel_walk_dirs, + read_plist_safe, + run_command, +) from mac2nix.scanners.base import BaseScannerPlugin, register logger = logging.getLogger(__name__) @@ -43,7 +52,6 @@ } _MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB -_MAX_FILES_PER_APP = 500 # --- Library audit constants --- @@ -77,8 +85,16 @@ _SENSITIVE_KEY_PATTERNS = {"_KEY", "_TOKEN", "_SECRET", "_PASSWORD", "_CREDENTIAL", "_AUTH"} -_MAX_FILES_PER_DIR = 1000 -_MAX_SCRIPTS = 50 +# Redacts values in key=value / key: value lines where the key contains a sensitive word. +# Uses separator-prefixed compound patterns ([_.-]key, [_.-]token, etc.) 
to avoid false +# positives on words like "monkey", "turkey", "keyboard". Standalone patterns (password, +# secret, token) are anchored to the start of the key. Handles JSON quoted keys. +_SENSITIVE_VALUE_RE = re.compile( + r'^(\s*"?(?:\S*[_.\-](?:key|token|secret|password|credential|auth)' + r'|password|passwd|secret|token)"?\s*[:=]\s*).+', + re.IGNORECASE | re.MULTILINE, +) + _SYSTEM_SCAN_PATTERNS: dict[str, str] = { "Extensions": "*.kext", @@ -138,19 +154,23 @@ def scan(self) -> LibraryResult: scripts = self._scan_scripts(home_lib) # Capture uncovered files, workflows, and bundles from uncovered directories - for d in directories: - if d.covered_by_scanner is None and d.name not in _TRANSIENT_DIRS: - files, wf, bdl = self._capture_uncovered_dir(d.path) - uncovered_files.extend(files) - workflows.extend(wf) - bundles.extend(bdl) - + uncovered_dirs = [d.path for d in directories if d.covered_by_scanner is None and d.name not in _TRANSIENT_DIRS] + captured = parallel_walk_dirs(uncovered_dirs, self._capture_uncovered_dir) + for files, wf, bdl in captured: + uncovered_files.extend(files) + workflows.extend(wf) + bundles.extend(bdl) # Scan workflows from known Workflows/Services dirs for wf_dir_name in ["Workflows", "Services"]: wf_dir = home_lib / wf_dir_name if wf_dir.is_dir(): workflows.extend(self._scan_workflows(wf_dir)) + # Sort parallel-aggregated lists for deterministic output + uncovered_files.sort(key=lambda e: str(e.path)) + workflows.sort(key=lambda e: str(e.path)) + bundles.sort(key=lambda e: str(e.path)) + # --- App config scanning --- entries = self._scan_app_configs(home_lib) @@ -179,8 +199,6 @@ def scan(self) -> LibraryResult: def _scan_app_configs(self, home_lib: Path) -> list[AppConfigEntry]: """Walk Application Support, Group Containers, and Containers for app configs.""" - entries: list[AppConfigEntry] = [] - scan_dirs = [ home_lib / "Application Support", home_lib / "Group Containers", @@ -197,6 +215,7 @@ def _scan_app_configs(self, home_lib: 
Path) -> list[AppConfigEntry]: except PermissionError: logger.warning("Permission denied reading: %s", containers_dir) + all_app_dirs: list[Path] = [] for base_dir in scan_dirs: if not base_dir.is_dir(): continue @@ -211,25 +230,30 @@ def _scan_app_configs(self, home_lib: Path) -> list[AppConfigEntry]: if not os.access(app_dir, os.R_OK): logger.debug("Skipping TCC-protected directory: %s", app_dir) continue - self._scan_app_dir(app_dir, entries) + all_app_dirs.append(app_dir) + batched = parallel_walk_dirs(all_app_dirs, self._scan_app_dir) + entries = [e for batch in batched for e in batch] + entries.sort(key=lambda e: (e.app_name, str(e.path))) return entries - def _scan_app_dir(self, app_dir: Path, entries: list[AppConfigEntry]) -> None: + def _scan_app_dir(self, app_dir: Path) -> list[AppConfigEntry]: app_name = app_dir.name - file_count = 0 + entries: list[AppConfigEntry] = [] try: for dirpath, dirnames, filenames in os.walk(app_dir, followlinks=False): # Prune skipped directories in-place - dirnames[:] = [d for d in dirnames if d not in WALK_SKIP_DIRS] + dirnames[:] = [d for d in dirnames if d not in WALK_SKIP_DIRS and not d.endswith(WALK_SKIP_SUFFIXES)] for filename in filenames: - if file_count >= _MAX_FILES_PER_APP: - logger.info("Reached %d file cap for app: %s", _MAX_FILES_PER_APP, app_name) - return - filepath = Path(dirpath) / filename + ext = filepath.suffix.lower() + + # Skip non-config files before any syscall + if ext in NON_CONFIG_EXTENSIONS: + continue + try: stat = filepath.stat() except OSError: @@ -239,7 +263,6 @@ def _scan_app_dir(self, app_dir: Path, entries: list[AppConfigEntry]) -> None: if stat.st_size > _MAX_FILE_SIZE: continue - ext = filepath.suffix.lower() file_type = _EXTENSION_MAP.get(ext, ConfigFileType.UNKNOWN) scannable = file_type != ConfigFileType.DATABASE @@ -256,10 +279,11 @@ def _scan_app_dir(self, app_dir: Path, entries: list[AppConfigEntry]) -> None: modified_time=modified_time, ) ) - file_count += 1 except PermissionError: 
logger.warning("Permission denied reading app config dir: %s", app_dir) + return entries + # --- Library audit scanning --- def _audit_directories(self, lib_path: Path) -> list[LibraryDirEntry]: @@ -267,27 +291,27 @@ def _audit_directories(self, lib_path: Path) -> list[LibraryDirEntry]: if not lib_path.is_dir(): return [] - entries: list[LibraryDirEntry] = [] try: - for child in sorted(lib_path.iterdir()): - if not child.is_dir(): - continue - covered = _COVERED_DIRS.get(child.name) - file_count, total_size, newest_mod = self._dir_stats(child) - entries.append( - LibraryDirEntry( - name=child.name, - path=child, - file_count=file_count, - total_size_bytes=total_size, - covered_by_scanner=covered, - has_user_content=covered is None and child.name not in _TRANSIENT_DIRS, - newest_modification=newest_mod, - ) - ) + children = [c for c in sorted(lib_path.iterdir()) if c.is_dir()] except PermissionError: logger.warning("Permission denied reading: %s", lib_path) + return [] + def _compute_entry(child: Path) -> LibraryDirEntry: + covered = _COVERED_DIRS.get(child.name) + file_count, total_size, newest_mod = self._dir_stats(child) + return LibraryDirEntry( + name=child.name, + path=child, + file_count=file_count, + total_size_bytes=total_size, + covered_by_scanner=covered, + has_user_content=covered is None and child.name not in _TRANSIENT_DIRS, + newest_modification=newest_mod, + ) + + entries = parallel_walk_dirs(children, _compute_entry) + entries.sort(key=lambda e: e.name) return entries @staticmethod @@ -319,16 +343,11 @@ def _capture_uncovered_dir( files: list[LibraryFileEntry] = [] workflows: list[WorkflowEntry] = [] bundles: list[BundleEntry] = [] - count = 0 try: for dirpath, dirnames, filenames in os.walk(dir_path, followlinks=False): for filename in filenames: - if count >= _MAX_FILES_PER_DIR: - logger.info("Reached %d file cap for directory: %s", _MAX_FILES_PER_DIR, dir_path) - return files, workflows, bundles filepath = Path(dirpath) / filename - count += 1 
entry = self._classify_file(filepath) if entry is not None: files.append(entry) @@ -343,7 +362,7 @@ def _capture_uncovered_dir( workflows.append(wf) elif any(dirname.endswith(ext) for ext in _BUNDLE_EXTENSIONS): bundles.append(self._parse_bundle(sub_path)) - elif dirname not in WALK_SKIP_DIRS: + elif dirname not in WALK_SKIP_DIRS and not dirname.endswith(WALK_SKIP_SUFFIXES): kept.append(dirname) dirnames[:] = kept except PermissionError: @@ -353,13 +372,17 @@ def _capture_uncovered_dir( def _classify_file(self, filepath: Path) -> LibraryFileEntry | None: """Classify and capture a file from an uncovered directory.""" + suffix = filepath.suffix.lower() + + if suffix in NON_CONFIG_EXTENSIONS: + return None + try: stat = filepath.stat() except OSError: return None size = stat.st_size - suffix = filepath.suffix.lower() file_type = suffix.lstrip(".") if suffix else "unknown" plist_content: dict[str, Any] | None = None text_content: str | None = None @@ -377,7 +400,8 @@ def _classify_file(self, filepath: Path) -> LibraryFileEntry | None: content_hash = hash_file(filepath) if size < 65536: with contextlib.suppress(OSError): - text_content = filepath.read_text(errors="replace") + raw_text = filepath.read_text(errors="replace") + text_content = _SENSITIVE_VALUE_RE.sub(r"\1***REDACTED***", raw_text) strategy = "text_capture" if text_content else "hash_only" elif suffix in _BUNDLE_EXTENSIONS: strategy = "bundle" @@ -531,9 +555,6 @@ def _scan_scripts(self, lib_path: Path) -> list[str]: scripts: list[str] = [] try: for f in sorted(scripts_dir.iterdir()): - if len(scripts) >= _MAX_SCRIPTS: - logger.info("Reached %d script cap for: %s", _MAX_SCRIPTS, scripts_dir) - break if f.is_file(): if f.suffix == ".scpt": # Try to decompile AppleScript diff --git a/src/mac2nix/scanners/nix_state.py b/src/mac2nix/scanners/nix_state.py index 9476900..5e35d6e 100644 --- a/src/mac2nix/scanners/nix_state.py +++ b/src/mac2nix/scanners/nix_state.py @@ -24,7 +24,7 @@ NixRegistryEntry, NixState, ) 
-from mac2nix.scanners._utils import run_command +from mac2nix.scanners._utils import WALK_SKIP_DIRS, WALK_SKIP_SUFFIXES, parallel_walk_dirs, run_command from mac2nix.scanners.base import BaseScannerPlugin, register logger = logging.getLogger(__name__) @@ -32,28 +32,22 @@ _SENSITIVE_PATTERNS = {"ACCESS_TOKEN", "SECRET", "PASSWORD", "CREDENTIAL", "NETRC"} _SENSITIVE_EXACT_KEYS = {"access-tokens", "netrc-file"} -_PACKAGE_CAP = 500 -_ADJACENT_CAP = 50 -_ADJACENT_MAX_DEPTH = 2 -_PRUNE_DIRS = { - # VCS / build - ".git", - "node_modules", - ".direnv", - "__pycache__", - ".venv", - # macOS non-project directories (avoid wasting IO at depth 0-1) - "Library", - "Applications", - "Downloads", - "Movies", - "Music", - "Pictures", - "Public", - ".Trash", - ".cache", - ".local", -} +_ADJACENT_MAX_DEPTH = 5 +_NON_PROJECT_DIRS = frozenset( + { + # macOS non-project directories — checked at all walk depths. + # These never contain devbox.json/devenv.nix/.envrc. + "Library", + "Applications", + "Downloads", + "Movies", + "Music", + "Pictures", + "Public", + ".Trash", + ".local", + } +) _SYSTEM_NIX_CONF = Path("/etc/nix/nix.conf") _VERSION_RE = re.compile(r"(\d+\.\d+[\w.]*)") @@ -175,7 +169,7 @@ def _detect_profiles(self) -> list[NixProfile]: NixProfile( name="default", path=nix_profile_path, - packages=packages[:_PACKAGE_CAP], + packages=packages, ) ) return profiles @@ -193,7 +187,7 @@ def _detect_profiles(self) -> list[NixProfile]: NixProfile( name="default", path=Path.home() / ".nix-profile", - packages=packages[:_PACKAGE_CAP], + packages=packages, ) ) return profiles @@ -213,7 +207,7 @@ def _detect_profiles(self) -> list[NixProfile]: NixProfile( name="default", path=Path.home() / ".nix-profile", - packages=packages[:_PACKAGE_CAP], + packages=packages, ) ) @@ -355,8 +349,7 @@ def _get_hm_packages() -> list[str]: result = run_command(["home-manager", "packages"]) if result is None or result.returncode != 0: return [] - packages = [line.strip() for line in 
result.stdout.strip().splitlines() if line.strip()] - return packages[:_PACKAGE_CAP] + return [line.strip() for line in result.stdout.strip().splitlines() if line.strip()] def _detect_channels_and_flakes( self, @@ -508,16 +501,48 @@ def _parse_max_jobs(value: str | None) -> int | None: def _detect_nix_adjacent( self, ) -> tuple[list[DevboxProject], list[DevenvProject], list[NixDirenvConfig]]: - devbox_projects: list[DevboxProject] = [] - devenv_projects: list[DevenvProject] = [] - direnv_configs: list[NixDirenvConfig] = [] - home = Path.home() - self._walk_for_adjacent(home, 0, devbox_projects, devenv_projects, direnv_configs) + try: + children = sorted(home.iterdir()) + except (PermissionError, OSError): + return [], [], [] + + walkable = [ + c + for c in children + if c.is_dir() + and c.name not in _NON_PROJECT_DIRS + and c.name not in WALK_SKIP_DIRS + and not c.name.endswith(WALK_SKIP_SUFFIXES) + ] + + results = parallel_walk_dirs(walkable, self._walk_child_for_adjacent) - return devbox_projects, devenv_projects, direnv_configs + devbox: list[DevboxProject] = [] + devenv: list[DevenvProject] = [] + direnv: list[NixDirenvConfig] = [] + for db, de, dc in results: + devbox.extend(db) + devenv.extend(de) + direnv.extend(dc) + + devbox.sort(key=lambda e: str(e.path)) + devenv.sort(key=lambda e: str(e.path)) + direnv.sort(key=lambda e: str(e.path)) + + return devbox, devenv, direnv + + def _walk_child_for_adjacent( + self, + child: Path, + ) -> tuple[list[DevboxProject], list[DevenvProject], list[NixDirenvConfig]]: + devbox: list[DevboxProject] = [] + devenv: list[DevenvProject] = [] + direnv: list[NixDirenvConfig] = [] + self._walk_recursive(child, 1, devbox, devenv, direnv) + return devbox, devenv, direnv - def _walk_for_adjacent( + def _walk_recursive( self, directory: Path, depth: int, @@ -534,24 +559,22 @@ def _walk_for_adjacent( return for entry in entries: - if ( - len(devbox_projects) >= _ADJACENT_CAP - and len(devenv_projects) >= _ADJACENT_CAP - and 
len(direnv_configs) >= _ADJACENT_CAP - ): - break - if entry.is_dir(): - if entry.name in _PRUNE_DIRS: + if entry.is_dir() and not entry.is_symlink(): + if ( + entry.name in WALK_SKIP_DIRS + or entry.name in _NON_PROJECT_DIRS + or entry.name.endswith(WALK_SKIP_SUFFIXES) + ): continue - self._walk_for_adjacent(entry, depth + 1, devbox_projects, devenv_projects, direnv_configs) + self._walk_recursive(entry, depth + 1, devbox_projects, devenv_projects, direnv_configs) elif entry.is_file(): - if entry.name == "devbox.json" and len(devbox_projects) < _ADJACENT_CAP: + if entry.name == "devbox.json": packages = self._parse_devbox_json(entry) devbox_projects.append(DevboxProject(path=entry.parent, packages=packages)) - elif entry.name == "devenv.nix" and len(devenv_projects) < _ADJACENT_CAP: + elif entry.name == "devenv.nix": has_lock = (entry.parent / "devenv.lock").exists() devenv_projects.append(DevenvProject(path=entry.parent, has_lock=has_lock)) - elif entry.name == ".envrc" and len(direnv_configs) < _ADJACENT_CAP: + elif entry.name == ".envrc": self._check_envrc(entry, direnv_configs) @staticmethod diff --git a/tests/scanners/test_library_scanner.py b/tests/scanners/test_library_scanner.py index 844be6e..c9c9bd3 100644 --- a/tests/scanners/test_library_scanner.py +++ b/tests/scanners/test_library_scanner.py @@ -4,9 +4,12 @@ from pathlib import Path from unittest.mock import MagicMock, patch +import pytest + from mac2nix.models.files import ConfigFileType, LibraryResult from mac2nix.scanners.library_scanner import ( _COVERED_DIRS, + _SENSITIVE_VALUE_RE, _TRANSIENT_DIRS, LibraryScanner, _redact_sensitive_keys, @@ -272,7 +275,7 @@ def test_large_file_skipped(self, tmp_path: Path) -> None: assert len(result.app_configs) == 1 assert result.app_configs[0].path.name == "small.json" - def test_max_files_per_app_cap(self, tmp_path: Path) -> None: + def test_processes_all_files_no_cap(self, tmp_path: Path) -> None: app_support = _setup_app_support(tmp_path) app_dir = 
app_support / "ManyFilesApp" app_dir.mkdir() @@ -287,7 +290,7 @@ def test_max_files_per_app_cap(self, tmp_path: Path) -> None: assert isinstance(result, LibraryResult) app_entries = [e for e in result.app_configs if e.app_name == "ManyFilesApp"] - assert len(app_entries) == 500 + assert len(app_entries) == 501 def test_skips_non_config_dirs(self, tmp_path: Path) -> None: app_support = _setup_app_support(tmp_path) @@ -724,7 +727,7 @@ def test_text_replacements_no_db(self, tmp_path: Path) -> None: result = LibraryScanner()._scan_text_replacements(lib) assert result == [] - def test_capture_uncovered_dir_walks_below_cap(self, tmp_path: Path) -> None: + def test_capture_uncovered_dir_walks_all_files(self, tmp_path: Path) -> None: for i in range(210): (tmp_path / f"file{i:03d}.txt").write_text(f"content {i}") @@ -734,7 +737,7 @@ def test_capture_uncovered_dir_walks_below_cap(self, tmp_path: Path) -> None: ): files, _workflows, _bundles = LibraryScanner()._capture_uncovered_dir(tmp_path) - assert len(files) == 210 # all files captured (below 1000 cap) + assert len(files) == 210 def test_capture_uncovered_dir_skips_non_config_dirs(self, tmp_path: Path) -> None: config_dir = tmp_path / "real_config" @@ -948,7 +951,7 @@ def test_null_rows_filtered(self, tmp_path: Path) -> None: class TestCaptureUncoveredDirEdgeCases: - def test_file_cap_enforced(self, tmp_path: Path) -> None: + def test_processes_all_files_no_dir_cap(self, tmp_path: Path) -> None: for i in range(1050): (tmp_path / f"file{i:04d}.txt").write_text(f"content {i}") @@ -958,8 +961,7 @@ def test_file_cap_enforced(self, tmp_path: Path) -> None: ): files, _workflows, _bundles = LibraryScanner()._capture_uncovered_dir(tmp_path) - # Count includes all files encountered, but total should be capped - assert len(files) <= 1000 + assert len(files) == 1050 def test_workflow_bundles_discovered(self, tmp_path: Path) -> None: wf = tmp_path / "MyAction.workflow" @@ -1011,6 +1013,83 @@ def test_plist_returns_non_dict(self, 
tmp_path: Path) -> None: assert entry.plist_content is None assert entry.content_hash == "abc123" + def test_classify_file_non_config_extension_returns_none(self, tmp_path: Path) -> None: + py_file = tmp_path / "script.py" + py_file.write_text("print('hello')") + + assert LibraryScanner()._classify_file(py_file) is None + + def test_classify_file_redacts_sensitive_text(self, tmp_path: Path) -> None: + conf_file = tmp_path / "app.conf" + conf_file.write_text("host = localhost\npassword = secret123\nport = 8080\n") + + with patch("mac2nix.scanners.library_scanner.hash_file", return_value="hash"): + entry = LibraryScanner()._classify_file(conf_file) + + assert entry is not None + assert entry.text_content is not None + assert "secret123" not in entry.text_content + assert "***REDACTED***" in entry.text_content + assert "localhost" in entry.text_content + assert "8080" in entry.text_content + + +class TestSensitiveValueRedaction: + """Test _SENSITIVE_VALUE_RE directly — no file I/O.""" + + @pytest.mark.parametrize( + ("line", "secret"), + [ + # Bare standalone keys + ("password = secret123", "secret123"), + ("secret = mysecret", "mysecret"), + ("token = ghp_abc", "ghp_abc"), + ("passwd = hunter2", "hunter2"), + # Compound keys with underscore + ("db_password = xxx", "xxx"), + ("api_key: sk-abc", "sk-abc"), + ("SECRET_KEY = xxx", "xxx"), + ("AUTH_TOKEN: ghp_xxx", "ghp_xxx"), + # Compound keys with hyphen/dot + ("access-token = xxx", "xxx"), + ("auth.token: xxx", "xxx"), + ("private-key = xxx", "xxx"), + # No-space separators + ("TOKEN=ghp_realtoken", "ghp_realtoken"), + # JSON quoted keys + ('"password": "secret123"', '"secret123"'), + ('"api_key": "sk-abc"', '"sk-abc"'), + # Indented (YAML) + (" password: secret123", "secret123"), + (" db_password: xxx", "xxx"), + ], + ids=lambda v: v[:25] if isinstance(v, str) else v, + ) + def test_redacts_sensitive_values(self, line: str, secret: str) -> None: + result = _SENSITIVE_VALUE_RE.sub(r"\1***REDACTED***", line) + assert 
secret not in result + assert "***REDACTED***" in result + + @pytest.mark.parametrize( + ("line", "preserved_value"), + [ + ("monkey = banana", "banana"), + ("turkey = bird", "bird"), + ("keyboard = us", "us"), + ("author = John", "John"), + ("hockey_score = 3", "3"), + ("donkey_kong = mario", "mario"), + ("host = localhost", "localhost"), + ("port = 8080", "8080"), + ("name = Alice", "Alice"), + ], + ids=lambda v: v[:25] if isinstance(v, str) else v, + ) + def test_preserves_non_sensitive_values(self, line: str, preserved_value: str) -> None: + result = _SENSITIVE_VALUE_RE.sub(r"\1***REDACTED***", line) + assert result == line + assert preserved_value in result + class TestKeyBindingsEdgeCases: def test_non_string_dict_actions_filtered(self, tmp_path: Path) -> None: @@ -1136,3 +1215,29 @@ def test_scan_bundles_in_dir_skips_symlinks(self, tmp_path: Path) -> None: names = [b.name for b in result] assert "Real.app" in names assert "Linked.app" not in names + + +class TestScanAppDirEdgeCases: + def test_scan_app_dir_skips_non_config_extensions(self, tmp_path: Path) -> None: + app_dir = tmp_path / "MyApp" + app_dir.mkdir() + (app_dir / "config.json").write_text('{"key": "value"}') + (app_dir / "module.py").write_text("print('hello')") + + entries = LibraryScanner()._scan_app_dir(app_dir) + + paths = [e.path.name for e in entries] + assert "config.json" in paths + assert "module.py" not in paths + + def test_scan_app_dir_skips_noindex_dirs(self, tmp_path: Path) -> None: + app_dir = tmp_path / "MyApp" + app_dir.mkdir() + noindex_dir = app_dir / "foo.noindex" + noindex_dir.mkdir() + (noindex_dir / "settings.json").write_text('{"hidden": true}') + + entries = LibraryScanner()._scan_app_dir(app_dir) + + paths = [str(e.path) for e in entries] + assert not any("foo.noindex" in p for p in paths) diff --git a/tests/scanners/test_nix_state.py b/tests/scanners/test_nix_state.py index 4597196..0e3ac30 100644 --- a/tests/scanners/test_nix_state.py +++ 
b/tests/scanners/test_nix_state.py @@ -318,7 +318,7 @@ def test_manifest_json_fallback(self, tmp_path: Path) -> None: assert len(result) == 1 assert result[0].packages[0].name == "curl-8.0" - def test_package_cap(self, cmd_result, tmp_path: Path) -> None: + def test_no_package_cap(self, cmd_result, tmp_path: Path) -> None: elements = [{"storePaths": [f"/nix/store/hash-pkg{i}-1.0"], "attrPath": f"pkg{i}"} for i in range(600)] profile_json = json.dumps({"elements": elements}) @@ -332,7 +332,7 @@ def test_package_cap(self, cmd_result, tmp_path: Path) -> None: ): result = scanner._detect_profiles() - assert len(result[0].packages) == 500 + assert len(result[0].packages) == 600 # --------------------------------------------------------------------------- @@ -801,6 +801,12 @@ def test_pruned_dirs_skipped(self, tmp_path: Path) -> None: node_modules.mkdir() (node_modules / "devbox.json").write_text(json.dumps({"packages": ["bar"]})) + # NEW: WALK_SKIP_DIRS entries (extended set replacing old _PRUNE_DIRS) + for skip_dir in ["DerivedData", "Caches", "site-packages"]: + d = tmp_path / "project" / skip_dir + d.mkdir(parents=True) + (d / "devbox.json").write_text(json.dumps({"packages": ["skip"]})) + scanner = NixStateScanner() with patch("mac2nix.scanners.nix_state.Path.home", return_value=tmp_path): devbox_projects, _, _ = scanner._detect_nix_adjacent() @@ -808,20 +814,22 @@ def test_pruned_dirs_skipped(self, tmp_path: Path) -> None: assert len(devbox_projects) == 0 def test_depth_limit(self, tmp_path: Path) -> None: - # depth 3 (home -> a -> b -> c) -- should not be found - deep = tmp_path / "a" / "b" / "c" + # depth 6 (home -> a -> b -> c -> d -> e -> f) -- should not be found + deep = tmp_path / "a" / "b" / "c" / "d" / "e" / "f" deep.mkdir(parents=True) (deep / "devbox.json").write_text(json.dumps({"packages": ["deep"]})) - # depth 2 (home -> a -> b) -- should be found - (tmp_path / "a" / "b" / "devbox.json").write_text(json.dumps({"packages": ["ok"]})) + # depth 5 (home 
-> a -> b -> c -> d -> e) -- should be found + (tmp_path / "a" / "b" / "c" / "d" / "e" / "devbox.json").write_text( + json.dumps({"packages": ["ok"]}), + ) scanner = NixStateScanner() with patch("mac2nix.scanners.nix_state.Path.home", return_value=tmp_path): devbox_projects, _, _ = scanner._detect_nix_adjacent() paths = [str(p.path) for p in devbox_projects] - assert str(tmp_path / "a" / "b") in paths + assert str(tmp_path / "a" / "b" / "c" / "d" / "e") in paths assert str(deep) not in paths def test_devbox_json_malformed(self, tmp_path: Path) -> None: @@ -836,7 +844,7 @@ def test_devbox_json_malformed(self, tmp_path: Path) -> None: assert len(devbox_projects) == 1 assert devbox_projects[0].packages == [] - def test_cap_limit(self, tmp_path: Path) -> None: + def test_no_cap_all_projects_returned(self, tmp_path: Path) -> None: for i in range(55): d = tmp_path / f"proj{i}" d.mkdir() @@ -846,7 +854,40 @@ def test_cap_limit(self, tmp_path: Path) -> None: with patch("mac2nix.scanners.nix_state.Path.home", return_value=tmp_path): devbox_projects, _, _ = scanner._detect_nix_adjacent() - assert len(devbox_projects) == 50 + assert len(devbox_projects) == 55 + + def test_non_project_dirs_skipped(self, tmp_path: Path) -> None: + for non_proj in ["Library", "Music", "Pictures", "Downloads"]: + d = tmp_path / non_proj + d.mkdir() + (d / "devbox.json").write_text(json.dumps({"packages": ["skip"]})) + + # Control: a normal project dir SHOULD be found + proj = tmp_path / "myproject" + proj.mkdir() + (proj / "devbox.json").write_text(json.dumps({"packages": ["found"]})) + + scanner = NixStateScanner() + with patch("mac2nix.scanners.nix_state.Path.home", return_value=tmp_path): + devbox_projects, _, _ = scanner._detect_nix_adjacent() + + assert len(devbox_projects) == 1 + assert devbox_projects[0].path == proj + + def test_walk_skip_suffixes_applied(self, tmp_path: Path) -> None: + noindex = tmp_path / "attachments.noindex" + noindex.mkdir() + (noindex / 
"devbox.json").write_text(json.dumps({"packages": ["skip"]})) + + lproj = tmp_path / "en.lproj" + lproj.mkdir() + (lproj / "devbox.json").write_text(json.dumps({"packages": ["skip"]})) + + scanner = NixStateScanner() + with patch("mac2nix.scanners.nix_state.Path.home", return_value=tmp_path): + devbox_projects, _, _ = scanner._detect_nix_adjacent() + + assert len(devbox_projects) == 0 # --------------------------------------------------------------------------- diff --git a/tests/scanners/test_parallel_walk.py b/tests/scanners/test_parallel_walk.py new file mode 100644 index 0000000..28959e3 --- /dev/null +++ b/tests/scanners/test_parallel_walk.py @@ -0,0 +1,166 @@ +"""Unit tests for parallel_walk_dirs() and related constants in _utils.py.""" + +from __future__ import annotations + +import logging +from pathlib import Path + +import pytest + +from mac2nix.scanners._utils import ( + NON_CONFIG_EXTENSIONS, + WALK_SKIP_DIRS, + WALK_SKIP_SUFFIXES, + parallel_walk_dirs, +) + + +class TestParallelWalkDirs: + def test_empty_dirs_list(self) -> None: + """Returns [] for empty input.""" + result = parallel_walk_dirs([], lambda d: d.name) + assert result == [] + + def test_single_dir_skips_pool(self, tmp_path: Path) -> None: + """≤2 dirs bypass ThreadPoolExecutor (serial path).""" + dirs = [tmp_path / "a", tmp_path / "b"] + for d in dirs: + d.mkdir() + + called: list[Path] = [] + + def collect(d: Path) -> str: + called.append(d) + return d.name + + result = parallel_walk_dirs(dirs, collect) + assert sorted(result) == ["a", "b"] + assert set(called) == set(dirs) + + def test_parallel_collects_all_results(self, tmp_path: Path) -> None: + """All dirs processed and results collected when >2 dirs (pool path).""" + dirs = [tmp_path / f"dir{i}" for i in range(6)] + for d in dirs: + d.mkdir() + + result = parallel_walk_dirs(dirs, lambda d: d.name) + assert sorted(result) == [f"dir{i}" for i in range(6)] + + def test_failed_dir_logged_not_raised(self, tmp_path: Path, caplog: 
pytest.LogCaptureFixture) -> None: + """Exception in one worker doesn't stop others; error is logged.""" + dirs = [tmp_path / f"d{i}" for i in range(5)] + for d in dirs: + d.mkdir() + + def maybe_raise(d: Path) -> str: + if d.name == "d2": + msg = "deliberate failure" + raise ValueError(msg) + return d.name + + # Run — should not raise, d2 skipped, others collected + with caplog.at_level(logging.ERROR, logger="mac2nix.scanners._utils"): + results = parallel_walk_dirs(dirs, maybe_raise) + # 4 successful workers (d0, d1, d3, d4), d2 raises + assert len(results) == 4 + assert "d2" not in results + assert "Failed to process directory" in caplog.text + assert "d2" in caplog.text + + def test_failed_dir_serial_path_logged(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """Serial path (≤2 dirs) also logs exceptions and continues.""" + dirs = [tmp_path / "good", tmp_path / "bad"] + for d in dirs: + d.mkdir() + + def maybe_raise(d: Path) -> str: + if d.name == "bad": + msg = "serial failure" + raise ValueError(msg) + return d.name + + with caplog.at_level(logging.ERROR, logger="mac2nix.scanners._utils"): + results = parallel_walk_dirs(dirs, maybe_raise) + assert results == ["good"] + assert "Failed to process directory" in caplog.text + assert "bad" in caplog.text + + def test_max_workers_capped_at_dir_count(self, tmp_path: Path) -> None: + """Pool size is min(max_workers, len(dirs)) — no idle threads.""" + # 4 dirs but max_workers=100 → pool of 4 at most + dirs = [tmp_path / f"x{i}" for i in range(4)] + for d in dirs: + d.mkdir() + + # Just verifying it completes without error with a large max_workers + result = parallel_walk_dirs(dirs, lambda d: d.name, max_workers=100) + assert sorted(result) == [f"x{i}" for i in range(4)] + + def test_process_fn_receives_path(self, tmp_path: Path) -> None: + """Callback receives the exact Path objects passed in.""" + dirs = [tmp_path / f"p{i}" for i in range(4)] + for d in dirs: + d.mkdir() + + received: list[Path] = 
[] + + def capture(d: Path) -> int: + received.append(d) + return 1 + + parallel_walk_dirs(dirs, capture) + assert set(received) == set(dirs) + + def test_results_order_not_guaranteed(self, tmp_path: Path) -> None: + """Results may arrive in any order — sorted comparison is valid.""" + dirs = [tmp_path / f"z{i}" for i in range(5)] + for d in dirs: + d.mkdir() + + result = parallel_walk_dirs(dirs, lambda d: d.name) + # Order not guaranteed, but sorted must match + assert sorted(result) == sorted(f"z{i}" for i in range(5)) + + def test_walk_skip_dirs_contains_new_entries(self) -> None: + """Verify key new entries added to WALK_SKIP_DIRS.""" + assert "site-packages" in WALK_SKIP_DIRS + assert "Crashpad" in WALK_SKIP_DIRS + assert ".Spotlight-V100" in WALK_SKIP_DIRS + # Spot-check a few more new categories + assert ".direnv" in WALK_SKIP_DIRS + assert "CoreSimulator" in WALK_SKIP_DIRS + assert "steamapps" in WALK_SKIP_DIRS + + def test_non_config_extensions_contains_key_types(self) -> None: + """Spot-check representative extensions from the source-code, media, and compiled/binary categories.""" + # Source code + assert ".py" in NON_CONFIG_EXTENSIONS + assert ".js" in NON_CONFIG_EXTENSIONS + # Media/images + assert ".png" in NON_CONFIG_EXTENSIONS + # Compiled/binary + assert ".so" in NON_CONFIG_EXTENSIONS + assert ".dylib" in NON_CONFIG_EXTENSIONS + + def test_walk_skip_suffixes_contains_expected(self) -> None: + """WALK_SKIP_SUFFIXES contains .noindex and .lproj.""" + assert ".noindex" in WALK_SKIP_SUFFIXES + assert ".lproj" in WALK_SKIP_SUFFIXES + + def test_parallel_results_match_serial(self, tmp_path: Path) -> None: + """Serial (max_workers=1) and parallel (max_workers=4) produce identical sorted results.""" + dirs = [tmp_path / f"m{i}" for i in range(6)] + for d in dirs: + d.mkdir() + # Give each dir a predictable value via a file + (d / "marker.txt").write_text(d.name) + + def process(d: Path) -> str: + return (d / "marker.txt").read_text() + + # Serial path: ≤2 bypasses pool, so use max_workers=1 on 6 
dirs to test pool + serial = parallel_walk_dirs(dirs, process, max_workers=1) + parallel = parallel_walk_dirs(dirs, process, max_workers=4) + + assert sorted(serial) == sorted(parallel) + assert sorted(serial) == sorted(f"m{i}" for i in range(6))