def _find_meta_files_limited(
    self, root: Path, max_depth: int = 3, pattern: str = "*.leann.meta.json"
):
    """Yield meta files under ``root`` using a depth-limited, pruned walk.

    The walk avoids scanning very large trees: it stops descending once
    ``max_depth`` is exceeded and never enters directories that are unlikely
    to hold LEANN indexes (VCS metadata, virtualenvs, caches, build output,
    ``*.egg-info`` and any hidden directory other than ``.leann``).

    Args:
        root: Directory where the search starts. ``root`` itself is depth 0.
        max_depth: Deepest directory level whose entries are still examined.
            Files directly inside ``root`` are at depth 0, files inside
            ``root/a`` are at depth 1, and so on.
        pattern: Glob that each regular file is tested against via
            ``Path.match``.

    Yields:
        A ``Path`` for every regular file that matches ``pattern`` and lives
        in a directory that was visited.

    Directories that cannot be listed (for example because of missing
    permissions) are skipped silently.
    """
    # Exact directory names that are pruned from the walk.
    skip_dirs = {
        ".git",
        ".svn",
        ".hg",
        "node_modules",
        "__pycache__",
        ".venv",
        "venv",
        ".env",
        "env",
        ".tox",
        ".nox",
        ".mypy_cache",
        ".pytest_cache",
        ".ruff_cache",
        "dist",
        "build",
        ".eggs",
        ".cache",
        ".npm",
        ".yarn",
        "vendor",
        "Pods",
        ".gradle",
        "target",
    }
    # Suffix rules are kept apart from the exact names: a glob such as
    # "*.egg-info" placed in the set above would be compared literally and
    # therefore never match a real directory like "pkg.egg-info".
    skip_suffixes = (".egg-info",)

    def should_skip(dir_name: str) -> bool:
        """Return True when a directory with this name must not be entered."""
        if dir_name in skip_dirs or dir_name.endswith(skip_suffixes):
            return True
        # Hidden directories are pruned, except .leann which holds indexes.
        return dir_name.startswith(".") and dir_name != ".leann"

    def search_dir(path: Path, current_depth: int):
        """Recursively yield matches below ``path`` within the depth limit."""
        if current_depth > max_depth:
            return

        try:
            for item in path.iterdir():
                if item.is_file():
                    if item.match(pattern):
                        yield item
                # Test the name first: it is free, while is_dir() may stat.
                elif not should_skip(item.name) and item.is_dir():
                    yield from search_dir(item, current_depth + 1)
        except OSError:
            # Covers PermissionError too; unreadable directories are skipped.
            pass

    yield from search_dir(root, 0)
+ """ # Get all project directories with .leann global_registry = Path.home() / ".leann" / "projects.json" all_projects = [] @@ -475,7 +550,7 @@ def list_indexes(self): print(" " + "─" * 45) current_indexes = self._discover_indexes_in_project( - current_path, exclude_dirs=other_projects + current_path, exclude_dirs=other_projects, max_depth=max_depth ) if current_indexes: for idx in current_indexes: @@ -494,7 +569,7 @@ def list_indexes(self): print(" " + "─" * 45) for project_path in other_projects: - project_indexes = self._discover_indexes_in_project(project_path) + project_indexes = self._discover_indexes_in_project(project_path, max_depth=max_depth) if not project_indexes: continue @@ -518,9 +593,9 @@ def list_indexes(self): projects_count = 0 for p in valid_projects: if p == current_path: - discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects) + discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects, max_depth=max_depth) else: - discovered = self._discover_indexes_in_project(p) + discovered = self._discover_indexes_in_project(p, max_depth=max_depth) if len(discovered) > 0: projects_count += 1 print(f"📊 Total: {total_indexes} indexes across {projects_count} projects") @@ -540,13 +615,17 @@ def list_indexes(self): print(" leann build my-docs --docs ./documents") def _discover_indexes_in_project( - self, project_path: Path, exclude_dirs: Optional[list[Path]] = None + self, project_path: Path, exclude_dirs: Optional[list[Path]] = None, max_depth: int = 3 ): """Discover all indexes in a project directory (both CLI and apps formats) - exclude_dirs: when provided, skip any APP-format index files that are - located under these directories. This prevents duplicates when the - current project is a parent directory of other registered projects. + Args: + project_path: The project directory to search. + exclude_dirs: When provided, skip any APP-format index files that are + located under these directories. 
This prevents duplicates when the + current project is a parent directory of other registered projects. + max_depth: Maximum directory depth to scan for app-format indexes. + Default is 3. Increase if indexes are in deeply nested directories. """ indexes = [] exclude_dirs = exclude_dirs or [] @@ -583,9 +662,10 @@ def _discover_indexes_in_project( } ) - # 2. Apps format: *.leann.meta.json files anywhere in the project + # 2. Apps format: *.leann.meta.json files in the project + # Use limited-depth search to avoid scanning entire large directories cli_indexes_dir = project_path / ".leann" / "indexes" - for meta_file in project_path.rglob("*.leann.meta.json"): + for meta_file in self._find_meta_files_limited(project_path, max_depth=max_depth): if meta_file.is_file(): # Skip CLI-built indexes (which store meta under .leann/indexes//) try: @@ -696,59 +776,43 @@ def _find_all_matching_indexes(self, index_name: str): # b) by the parent directory name (e.g., `new_txt`) seen_app_meta = set() - # 2a) by file base - for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"): - if meta_file.is_file(): - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in seen_app_meta: - continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + # Use limited-depth search to avoid scanning large directories + for meta_file in self._find_meta_files_limited(project_path, max_depth=3): + if not meta_file.is_file(): + continue - # 2b) by parent directory name - for meta_file in 
project_path.rglob("*.leann.meta.json"): - if meta_file.is_file() and meta_file.parent.name == index_name: - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in seen_app_meta: + # Skip CLI-built indexes' meta under .leann/indexes + try: + cli_indexes_dir = project_path / ".leann" / "indexes" + if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + except Exception: + pass + + file_base = meta_file.name.replace(".leann.meta.json", "") + parent_name = meta_file.parent.name + + # Check if this matches the requested index_name + # Match by file base or by parent directory name + if file_base != index_name and parent_name != index_name: + continue + + is_current = project_path == current_path + key = (str(project_path), str(meta_file)) + if key in seen_app_meta: + continue + seen_app_meta.add(key) + matches.append( + { + "project_path": project_path, + "files_dir": meta_file.parent, + "meta_file": meta_file, + "is_current": is_current, + "kind": "app", + "display_name": parent_name, + "file_base": file_base, + } + ) # Sort: current project first, then by project name matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name)) @@ -1679,7 +1743,7 @@ async def run(self, args=None): return if args.command == "list": - self.list_indexes() + self.list_indexes(max_depth=args.max_depth) elif args.command == "remove": self.remove_index(args.index_name, args.force) elif args.command 
def _has_app_indexes_limited(root: Path, max_depth: int = 3) -> bool:
    """Report whether an app-format index exists within ``max_depth`` of ``root``.

    An app-format index is recognised by a file whose name ends with
    ``.leann.meta.json``. The search is depth-limited and skips directories
    that are typically large and unlikely to hold LEANN indexes (VCS
    metadata, virtualenvs, caches, build output, ``*.egg-info`` and any
    hidden directory other than ``.leann``).

    Args:
        root: Directory where the search starts. ``root`` itself is depth 0.
        max_depth: Deepest directory level whose entries are still examined.

    Returns:
        True as soon as one matching file is found, otherwise False.
        Directories that cannot be listed are treated as containing nothing.
    """
    skip_dirs = {
        ".git",
        ".svn",
        ".hg",
        "node_modules",
        "__pycache__",
        ".venv",
        "venv",
        ".env",
        "env",
        ".tox",
        ".nox",
        ".mypy_cache",
        ".pytest_cache",
        ".ruff_cache",
        "dist",
        "build",
        ".eggs",
        ".cache",
        ".npm",
        ".yarn",
        "vendor",
        "Pods",
        ".gradle",
        "target",
    }
    # Matched by suffix because the real directory name varies per package
    # (e.g. "pkg.egg-info"); an exact-name lookup cannot express that.
    skip_suffixes = (".egg-info",)

    def should_skip(dir_name: str) -> bool:
        """Return True when a directory with this name must not be entered."""
        if dir_name in skip_dirs or dir_name.endswith(skip_suffixes):
            return True
        # Hidden directories are pruned, except .leann which holds indexes.
        return dir_name.startswith(".") and dir_name != ".leann"

    def search_dir(path: Path, current_depth: int) -> bool:
        """Return True if a meta file exists at or below ``path``."""
        if current_depth > max_depth:
            return False

        # Children of a directory at the limit would be rejected anyway, so
        # there is no point in stat'ing them.
        may_descend = current_depth < max_depth
        pending = []
        try:
            for item in path.iterdir():
                if item.is_file():
                    if item.name.endswith(".leann.meta.json"):
                        return True
                elif may_descend and not should_skip(item.name) and item.is_dir():
                    pending.append(item)
        except OSError:
            # Covers PermissionError too; unreadable directories are skipped.
            pass

        # Descend only after every file at this level has been checked, so a
        # shallow index is found without walking unrelated subtrees first.
        return any(search_dir(child, current_depth + 1) for child in pending)

    return search_dir(root, 0)
"""Tests for leann list command performance improvements.

These tests cover the limited-depth search that keeps ``leann list`` from
walking every file of a large directory such as $HOME.
See: https://github.com/yichuan-w/LEANN/issues/122
"""

from pathlib import Path


def _touch_meta(directory: Path, stem: str) -> None:
    """Create an empty ``<stem>.leann.meta.json`` file inside ``directory``."""
    (directory / f"{stem}.leann.meta.json").touch()


def _scan_names(root: Path, max_depth: int = 3) -> set:
    """Return the file names yielded by the CLI's limited-depth search."""
    from leann.cli import LeannCLI

    hits = LeannCLI()._find_meta_files_limited(root, max_depth=max_depth)
    return {hit.name for hit in hits}


class TestLimitedDepthSearch:
    """Exercise ``LeannCLI._find_meta_files_limited``."""

    def test_find_meta_files_respects_max_depth(self, tmp_path: Path):
        """Files located deeper than max_depth must not be reported."""
        # chain[i] is the directory at depth i; depth 4 is past the limit.
        chain = [tmp_path]
        for label in ("level1", "level2", "level3", "level4"):
            chain.append(chain[-1] / label)
        chain[-1].mkdir(parents=True)

        for directory, stem in zip(chain, ("root", "l1", "l2", "l3", "l4")):
            _touch_meta(directory, stem)

        names = _scan_names(tmp_path, max_depth=3)

        for expected in ("root", "l1", "l2", "l3"):
            assert f"{expected}.leann.meta.json" in names
        assert "l4.leann.meta.json" not in names

    def test_find_meta_files_skips_node_modules(self, tmp_path: Path):
        """Nothing below node_modules is reported."""
        vendored = tmp_path / "node_modules"
        vendored.mkdir()
        _touch_meta(vendored, "pkg")
        _touch_meta(tmp_path, "normal")

        names = _scan_names(tmp_path, max_depth=3)

        assert "normal.leann.meta.json" in names
        assert "pkg.leann.meta.json" not in names

    def test_find_meta_files_skips_hidden_dirs(self, tmp_path: Path):
        """Hidden directories are ignored, with .leann as the one exception."""
        for dir_name, stem in ((".hidden", "hidden"), (".leann", "leann")):
            target = tmp_path / dir_name
            target.mkdir()
            _touch_meta(target, stem)
        _touch_meta(tmp_path, "normal")

        names = _scan_names(tmp_path, max_depth=3)

        assert "normal.leann.meta.json" in names
        assert "leann.leann.meta.json" in names
        assert "hidden.leann.meta.json" not in names

    def test_find_meta_files_skips_venv(self, tmp_path: Path):
        """Virtual-environment directories are ignored."""
        env_names = [".venv", "venv", ".env", "env"]
        for env_name in env_names:
            env_dir = tmp_path / env_name
            env_dir.mkdir()
            _touch_meta(env_dir, env_name)
        _touch_meta(tmp_path, "normal")

        names = _scan_names(tmp_path, max_depth=3)

        assert "normal.leann.meta.json" in names
        for env_name in env_names:
            assert f"{env_name}.leann.meta.json" not in names

    def test_find_meta_files_skips_build_dirs(self, tmp_path: Path):
        """Build output and cache directories are ignored."""
        artefact_dirs = ["build", "dist", "__pycache__", ".cache"]
        for artefact in artefact_dirs:
            artefact_dir = tmp_path / artefact
            artefact_dir.mkdir()
            _touch_meta(artefact_dir, artefact)
        _touch_meta(tmp_path, "normal")

        names = _scan_names(tmp_path, max_depth=3)

        assert "normal.leann.meta.json" in names
        for artefact in artefact_dirs:
            assert f"{artefact}.leann.meta.json" not in names


class TestRegistryLimitedSearch:
    """Exercise ``leann.registry._has_app_indexes_limited``."""

    def test_has_app_indexes_limited_respects_depth(self, tmp_path: Path):
        """An index below max_depth is invisible; a shallower one is found."""
        from leann.registry import _has_app_indexes_limited

        second_level = tmp_path / "l1" / "l2"
        fourth_level = second_level / "l3" / "l4"
        fourth_level.mkdir(parents=True)

        # Only a too-deep index exists at first.
        _touch_meta(fourth_level, "deep")
        assert not _has_app_indexes_limited(tmp_path, max_depth=3)

        # An index at depth 2 is within reach.
        _touch_meta(second_level, "shallow")
        assert _has_app_indexes_limited(tmp_path, max_depth=3)

    def test_has_app_indexes_limited_skips_node_modules(self, tmp_path: Path):
        """An index inside node_modules does not count."""
        from leann.registry import _has_app_indexes_limited

        vendored = tmp_path / "node_modules"
        vendored.mkdir()
        _touch_meta(vendored, "pkg")
        assert not _has_app_indexes_limited(tmp_path, max_depth=3)

        _touch_meta(tmp_path, "normal")
        assert _has_app_indexes_limited(tmp_path, max_depth=3)


class TestDiscoverIndexesPerformance:
    """Check that ``_discover_indexes_in_project`` honours the depth limit."""

    def test_discover_indexes_skips_deep_directories(self, tmp_path: Path):
        """CLI-format indexes are found; too-deep app indexes are not."""
        from leann.cli import LeannCLI

        # CLI-format index under .leann/indexes/<name>.
        cli_index_dir = tmp_path / ".leann" / "indexes" / "my-index"
        cli_index_dir.mkdir(parents=True)
        _touch_meta(cli_index_dir, "documents")

        # App-format index at depth 4, beyond the default limit.
        too_deep = tmp_path.joinpath("a", "b", "c", "d")
        too_deep.mkdir(parents=True)
        _touch_meta(too_deep, "deep")

        discovered = LeannCLI()._discover_indexes_in_project(tmp_path)
        index_names = [entry["name"] for entry in discovered]

        assert "my-index" in index_names
        assert "d" not in index_names

    def test_discover_indexes_respects_custom_max_depth(self, tmp_path: Path):
        """Raising max_depth makes deeper app indexes visible."""
        from leann.cli import LeannCLI

        cli = LeannCLI()

        # App-format index at depth 5.
        fifth_level = tmp_path.joinpath("a", "b", "c", "d", "e")
        fifth_level.mkdir(parents=True)
        _touch_meta(fifth_level, "deep")

        shallow_names = [
            entry["name"] for entry in cli._discover_indexes_in_project(tmp_path, max_depth=3)
        ]
        assert "e" not in shallow_names

        deep_names = [
            entry["name"] for entry in cli._discover_indexes_in_project(tmp_path, max_depth=5)
        ]
        assert "e" in deep_names


class TestMaxDepthCliOption:
    """Check the ``--max-depth`` option of ``leann list``."""

    def test_max_depth_argument_is_parsed(self):
        """The option defaults to 3 and accepts explicit integers."""
        from leann.cli import LeannCLI

        parser = LeannCLI().create_parser()

        cases = (
            (["list"], 3),
            (["list", "--max-depth", "5"], 5),
            (["list", "--max-depth", "10"], 10),
        )
        for argv, expected in cases:
            assert parser.parse_args(argv).max_depth == expected