From 3308b63657721704003080354ac55325904a375e Mon Sep 17 00:00:00 2001 From: majiayu000 <1835304752@qq.com> Date: Thu, 25 Dec 2025 11:58:55 +0800 Subject: [PATCH 1/2] fix: limit depth of index discovery to prevent slow scans Fixes #122 The `leann list` command was scanning entire directory trees using `rglob()`, causing extremely slow performance when run in large directories like $HOME. Changes: - Add `_find_meta_files_limited()` method with max_depth parameter - Skip common large directories (node_modules, .venv, .git, etc.) - Apply limited search in `_discover_indexes_in_project()` and `_find_all_matching_indexes()` - Add `_has_app_indexes_limited()` in registry.py for faster checks - Add comprehensive tests for the new functionality Signed-off-by: majiayu000 <1835304752@qq.com> --- packages/leann-core/src/leann/cli.py | 153 ++++++++++------ packages/leann-core/src/leann/registry.py | 60 +++++- tests/test_cli_list_performance.py | 212 ++++++++++++++++++++++ 3 files changed, 371 insertions(+), 54 deletions(-) create mode 100644 tests/test_cli_list_performance.py diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 708892ab..139b4713 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -404,6 +404,68 @@ def _should_exclude_file(self, file_path: Path, gitignore_matches) -> bool: absolute_path = Path(str(file_path)) return gitignore_matches(absolute_path.as_posix()) + def _find_meta_files_limited( + self, root: Path, max_depth: int = 3, pattern: str = "*.leann.meta.json" + ): + """Find meta files with limited depth to avoid scanning large directories. + + Skips common large directories that shouldn't contain LEANN indexes. + """ + # Directories to skip - these are typically large and won't contain user indexes + skip_dirs = { + ".git", + ".svn", + ".hg", + "node_modules", + "__pycache__", + ".venv", + "venv", + ".env", + "env", + ".tox", + ".nox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + ".eggs", + "*.egg-info", + ".cache", + ".npm", + ".yarn", + "vendor", + "Pods", + ".gradle", + "target", + } + + def should_skip(dir_name: str) -> bool: + """Check if directory should be skipped.""" + if dir_name in skip_dirs: + return True + # Skip hidden directories (except .leann which we want) + if dir_name.startswith(".") and dir_name != ".leann": + return True + return False + + def search_dir(path: Path, current_depth: int): + """Recursively search with depth limit.""" + if current_depth > max_depth: + return + + try: + for item in path.iterdir(): + if item.is_file() and item.match(pattern): + yield item + elif item.is_dir() and not should_skip(item.name): + yield from search_dir(item, current_depth + 1) + except (PermissionError, OSError): + # Skip directories we can't read + pass + + yield from search_dir(root, 0) + def _is_git_submodule(self, path: Path) -> bool: """Check if a path is a git submodule.""" try: @@ -583,9 +645,10 @@ def _discover_indexes_in_project( } ) - # 2. Apps format: *.leann.meta.json files anywhere in the project + # 2. Apps format: *.leann.meta.json files in the project + # Use limited-depth search to avoid scanning entire large directories cli_indexes_dir = project_path / ".leann" / "indexes" - for meta_file in project_path.rglob("*.leann.meta.json"): + for meta_file in self._find_meta_files_limited(project_path, max_depth=3): if meta_file.is_file(): # Skip CLI-built indexes (which store meta under .leann/indexes//) try: @@ -696,59 +759,43 @@ def _find_all_matching_indexes(self, index_name: str): # b) by the parent directory name (e.g., `new_txt`) seen_app_meta = set() - # 2a) by file base - for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"): - if meta_file.is_file(): - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in seen_app_meta: - continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + # Use limited-depth search to avoid scanning large directories + for meta_file in self._find_meta_files_limited(project_path, max_depth=3): + if not meta_file.is_file(): + continue - # 2b) by parent directory name - for meta_file in project_path.rglob("*.leann.meta.json"): - if meta_file.is_file() and meta_file.parent.name == index_name: - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in seen_app_meta: + # Skip CLI-built indexes' meta under .leann/indexes + try: + cli_indexes_dir = project_path / ".leann" / "indexes" + if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + except Exception: + pass + + file_base = meta_file.name.replace(".leann.meta.json", "") + parent_name = meta_file.parent.name + + # Check if this matches the requested index_name + # Match by file base or by parent directory name + if file_base != index_name and parent_name != index_name: + continue + + is_current = project_path == current_path + key = (str(project_path), str(meta_file)) + if key in seen_app_meta: + continue + seen_app_meta.add(key) + matches.append( + { + "project_path": project_path, + "files_dir": meta_file.parent, + "meta_file": meta_file, + "is_current": is_current, + "kind": "app", + "display_name": parent_name, + "file_base": file_base, + } + ) # Sort: current project first, then by project name matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name)) diff --git a/packages/leann-core/src/leann/registry.py b/packages/leann-core/src/leann/registry.py index d4a559a5..bf6e6c2d 100644 --- a/packages/leann-core/src/leann/registry.py +++ b/packages/leann-core/src/leann/registry.py @@ -49,6 +49,63 @@ def autodiscover_backends(): # print("INFO: Backend auto-discovery finished.") +def _has_app_indexes_limited(root: Path, max_depth: int = 3) -> bool: + """Check if directory contains app-format indexes with limited depth search. + + Skips common large directories that shouldn't contain LEANN indexes. + """ + skip_dirs = { + ".git", + ".svn", + ".hg", + "node_modules", + "__pycache__", + ".venv", + "venv", + ".env", + "env", + ".tox", + ".nox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + ".eggs", + ".cache", + ".npm", + ".yarn", + "vendor", + "Pods", + ".gradle", + "target", + } + + def should_skip(dir_name: str) -> bool: + if dir_name in skip_dirs: + return True + if dir_name.startswith(".") and dir_name != ".leann": + return True + return False + + def search_dir(path: Path, current_depth: int) -> bool: + if current_depth > max_depth: + return False + + try: + for item in path.iterdir(): + if item.is_file() and item.name.endswith(".leann.meta.json"): + return True + elif item.is_dir() and not should_skip(item.name): + if search_dir(item, current_depth + 1): + return True + except (PermissionError, OSError): + pass + return False + + return search_dir(root, 0) + + def register_project_directory(project_dir: Optional[Union[str, Path]] = None): """ Register a project directory in the global LEANN registry. @@ -65,8 +122,9 @@ def register_project_directory(project_dir: Optional[Union[str, Path]] = None): # Only register directories that have some kind of LEANN content # Either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format) + # Use limited-depth search to avoid scanning large directories like $HOME has_cli_indexes = (project_dir / ".leann" / "indexes").exists() - has_app_indexes = any(project_dir.rglob("*.leann.meta.json")) + has_app_indexes = _has_app_indexes_limited(project_dir, max_depth=3) if not (has_cli_indexes or has_app_indexes): # Don't register if there are no LEANN indexes diff --git a/tests/test_cli_list_performance.py b/tests/test_cli_list_performance.py new file mode 100644 index 00000000..5e53864d --- /dev/null +++ b/tests/test_cli_list_performance.py @@ -0,0 +1,212 @@ +"""Tests for leann list command performance improvements. + +This module tests the limited-depth search functionality that prevents +leann list from scanning all files in large directories like $HOME. +See: https://github.com/yichuan-w/LEANN/issues/122 +""" + +from pathlib import Path + + +class TestLimitedDepthSearch: + """Test the _find_meta_files_limited method for performance.""" + + def test_find_meta_files_respects_max_depth(self, tmp_path: Path): + """Meta files beyond max_depth should not be found.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a deep directory structure + # depth 0: tmp_path + # depth 1: level1 + # depth 2: level2 + # depth 3: level3 + # depth 4: level4 (beyond default max_depth=3) + level1 = tmp_path / "level1" + level2 = level1 / "level2" + level3 = level2 / "level3" + level4 = level3 / "level4" + + level4.mkdir(parents=True) + + # Create meta files at different depths + (tmp_path / "root.leann.meta.json").touch() + (level1 / "l1.leann.meta.json").touch() + (level2 / "l2.leann.meta.json").touch() + (level3 / "l3.leann.meta.json").touch() + (level4 / "l4.leann.meta.json").touch() + + # Find with max_depth=3 (should find root, l1, l2, l3 but not l4) + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "root.leann.meta.json" in found_names + assert "l1.leann.meta.json" in found_names + assert "l2.leann.meta.json" in found_names + assert "l3.leann.meta.json" in found_names + assert "l4.leann.meta.json" not in found_names + + def test_find_meta_files_skips_node_modules(self, tmp_path: Path): + """Meta files inside node_modules should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a meta file inside node_modules + node_modules = tmp_path / "node_modules" + node_modules.mkdir() + (node_modules / "pkg.leann.meta.json").touch() + + # Create a normal meta file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "pkg.leann.meta.json" not in found_names + + def test_find_meta_files_skips_hidden_dirs(self, tmp_path: Path): + """Meta files inside hidden directories (except .leann) should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in hidden directories + hidden = tmp_path / ".hidden" + hidden.mkdir() + (hidden / "hidden.leann.meta.json").touch() + + # .leann should NOT be skipped + leann_dir = tmp_path / ".leann" + leann_dir.mkdir() + (leann_dir / "leann.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "leann.leann.meta.json" in found_names + assert "hidden.leann.meta.json" not in found_names + + def test_find_meta_files_skips_venv(self, tmp_path: Path): + """Meta files inside .venv and venv should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in virtual env directories + for venv_name in [".venv", "venv", ".env", "env"]: + venv_dir = tmp_path / venv_name + venv_dir.mkdir() + (venv_dir / f"{venv_name}.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert ".venv.leann.meta.json" not in found_names + assert "venv.leann.meta.json" not in found_names + assert ".env.leann.meta.json" not in found_names + assert "env.leann.meta.json" not in found_names + + def test_find_meta_files_skips_build_dirs(self, tmp_path: Path): + """Meta files inside build/dist directories should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in build directories + for build_name in ["build", "dist", "__pycache__", ".cache"]: + build_dir = tmp_path / build_name + build_dir.mkdir() + (build_dir / f"{build_name}.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "build.leann.meta.json" not in found_names + assert "dist.leann.meta.json" not in found_names + assert "__pycache__.leann.meta.json" not in found_names + assert ".cache.leann.meta.json" not in found_names + + +class TestRegistryLimitedSearch: + """Test the registry limited search functionality.""" + + def test_has_app_indexes_limited_respects_depth(self, tmp_path: Path): + """Should not find indexes beyond max_depth.""" + from leann.registry import _has_app_indexes_limited + + # Create a deep directory structure + level4 = tmp_path / "l1" / "l2" / "l3" / "l4" + level4.mkdir(parents=True) + + # Only create a file beyond depth 3 + (level4 / "deep.leann.meta.json").touch() + + # Should not find it with max_depth=3 + assert not _has_app_indexes_limited(tmp_path, max_depth=3) + + # Create one at depth 2 + (tmp_path / "l1" / "l2" / "shallow.leann.meta.json").touch() + + # Now should find it + assert _has_app_indexes_limited(tmp_path, max_depth=3) + + def test_has_app_indexes_limited_skips_node_modules(self, tmp_path: Path): + """Should skip node_modules directory.""" + from leann.registry import _has_app_indexes_limited + + # Create a meta file inside node_modules + node_modules = tmp_path / "node_modules" + node_modules.mkdir() + (node_modules / "pkg.leann.meta.json").touch() + + # Should not find it + assert not _has_app_indexes_limited(tmp_path, max_depth=3) + + # Create a normal meta file + (tmp_path / "normal.leann.meta.json").touch() + + # Now should find it + assert _has_app_indexes_limited(tmp_path, max_depth=3) + + +class TestDiscoverIndexesPerformance: + """Test that _discover_indexes_in_project uses limited search.""" + + def test_discover_indexes_skips_deep_directories(self, tmp_path: Path): + """Should not scan directories beyond max_depth.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a CLI format index (should always be found) + cli_indexes = tmp_path / ".leann" / "indexes" / "my-index" + cli_indexes.mkdir(parents=True) + (cli_indexes / "documents.leann.meta.json").touch() + + # Create an app format index at depth 4 (should not be found) + deep_dir = tmp_path / "a" / "b" / "c" / "d" + deep_dir.mkdir(parents=True) + (deep_dir / "deep.leann.meta.json").touch() + + indexes = cli._discover_indexes_in_project(tmp_path) + + # Should find the CLI index + assert any(idx["name"] == "my-index" for idx in indexes) + + # Should NOT find the deep app index + assert not any(idx["name"] == "d" for idx in indexes) From ba9164482af9a5c4283d303573865411d86adb73 Mon Sep 17 00:00:00 2001 From: majiayu000 <1835304752@qq.com> Date: Thu, 25 Dec 2025 19:01:48 +0800 Subject: [PATCH 2/2] feat: add --max-depth CLI option for leann list command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address reviewer feedback by making the directory scan depth configurable instead of hardcoding it to 3. Users with deeply nested project structures can now increase the depth limit as needed. - Add --max-depth argument to list command (default: 3) - Update list_indexes() and _discover_indexes_in_project() to accept max_depth - Add tests for the new CLI option and custom depth behavior 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- packages/leann-core/src/leann/cli.py | 41 +++++++++++++++++++-------- tests/test_cli_list_performance.py | 42 ++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 139b4713..b1365684 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -350,7 +350,14 @@ def create_parser(self) -> argparse.ArgumentParser: ) # List command - subparsers.add_parser("list", help="List all indexes") + list_parser = subparsers.add_parser("list", help="List all indexes") + list_parser.add_argument( + "--max-depth", + type=int, + default=3, + help="Maximum directory depth to scan for indexes (default: 3). " + "Increase if your indexes are in deeply nested directories.", + ) # Remove command remove_parser = subparsers.add_parser("remove", help="Remove an index") @@ -492,7 +499,13 @@ def _is_git_submodule(self, path: Path) -> bool: # If anything goes wrong, assume it's not a submodule return False - def list_indexes(self): + def list_indexes(self, max_depth: int = 3): + """List all LEANN indexes across registered projects. + + Args: + max_depth: Maximum directory depth to scan for app-format indexes. + Default is 3. Increase if indexes are in deeply nested directories. + """ # Get all project directories with .leann global_registry = Path.home() / ".leann" / "projects.json" all_projects = [] @@ -537,7 +550,7 @@ def list_indexes(self): print(" " + "─" * 45) current_indexes = self._discover_indexes_in_project( - current_path, exclude_dirs=other_projects + current_path, exclude_dirs=other_projects, max_depth=max_depth ) if current_indexes: for idx in current_indexes: @@ -556,7 +569,7 @@ def list_indexes(self): print(" " + "─" * 45) for project_path in other_projects: - project_indexes = self._discover_indexes_in_project(project_path) + project_indexes = self._discover_indexes_in_project(project_path, max_depth=max_depth) if not project_indexes: continue @@ -580,9 +593,9 @@ def list_indexes(self): projects_count = 0 for p in valid_projects: if p == current_path: - discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects) + discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects, max_depth=max_depth) else: - discovered = self._discover_indexes_in_project(p) + discovered = self._discover_indexes_in_project(p, max_depth=max_depth) if len(discovered) > 0: projects_count += 1 print(f"📊 Total: {total_indexes} indexes across {projects_count} projects") @@ -602,13 +615,17 @@ def list_indexes(self): print(" leann build my-docs --docs ./documents") def _discover_indexes_in_project( - self, project_path: Path, exclude_dirs: Optional[list[Path]] = None + self, project_path: Path, exclude_dirs: Optional[list[Path]] = None, max_depth: int = 3 ): """Discover all indexes in a project directory (both CLI and apps formats) - exclude_dirs: when provided, skip any APP-format index files that are - located under these directories. This prevents duplicates when the - current project is a parent directory of other registered projects. + Args: + project_path: The project directory to search. + exclude_dirs: When provided, skip any APP-format index files that are + located under these directories. This prevents duplicates when the + current project is a parent directory of other registered projects. + max_depth: Maximum directory depth to scan for app-format indexes. + Default is 3. Increase if indexes are in deeply nested directories. """ indexes = [] exclude_dirs = exclude_dirs or [] @@ -648,7 +665,7 @@ def _discover_indexes_in_project( # 2. Apps format: *.leann.meta.json files in the project # Use limited-depth search to avoid scanning entire large directories cli_indexes_dir = project_path / ".leann" / "indexes" - for meta_file in self._find_meta_files_limited(project_path, max_depth=3): + for meta_file in self._find_meta_files_limited(project_path, max_depth=max_depth): if meta_file.is_file(): # Skip CLI-built indexes (which store meta under .leann/indexes//) try: @@ -1726,7 +1743,7 @@ async def run(self, args=None): return if args.command == "list": - self.list_indexes() + self.list_indexes(max_depth=args.max_depth) elif args.command == "remove": self.remove_index(args.index_name, args.force) elif args.command == "build": diff --git a/tests/test_cli_list_performance.py b/tests/test_cli_list_performance.py index 5e53864d..01b9b430 100644 --- a/tests/test_cli_list_performance.py +++ b/tests/test_cli_list_performance.py @@ -210,3 +210,45 @@ def test_discover_indexes_skips_deep_directories(self, tmp_path: Path): # Should NOT find the deep app index assert not any(idx["name"] == "d" for idx in indexes) + + def test_discover_indexes_respects_custom_max_depth(self, tmp_path: Path): + """Should find deeper indexes when max_depth is increased.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create an app format index at depth 5 + deep_dir = tmp_path / "a" / "b" / "c" / "d" / "e" + deep_dir.mkdir(parents=True) + (deep_dir / "deep.leann.meta.json").touch() + + # With default max_depth=3, should NOT find it + indexes_shallow = cli._discover_indexes_in_project(tmp_path, max_depth=3) + assert not any(idx["name"] == "e" for idx in indexes_shallow) + + # With max_depth=5, should find it + indexes_deep = cli._discover_indexes_in_project(tmp_path, max_depth=5) + assert any(idx["name"] == "e" for idx in indexes_deep) + + +class TestMaxDepthCliOption: + """Test the --max-depth CLI option for leann list.""" + + def test_max_depth_argument_is_parsed(self): + """The --max-depth argument should be properly parsed.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + parser = cli.create_parser() + + # Test default value + args = parser.parse_args(["list"]) + assert args.max_depth == 3 + + # Test custom value + args = parser.parse_args(["list", "--max-depth", "5"]) + assert args.max_depth == 5 + + # Test another custom value + args = parser.parse_args(["list", "--max-depth", "10"]) + assert args.max_depth == 10