From 3308b63657721704003080354ac55325904a375e Mon Sep 17 00:00:00 2001 From: majiayu000 <1835304752@qq.com> Date: Thu, 25 Dec 2025 11:58:55 +0800 Subject: [PATCH 1/3] fix: limit depth of index discovery to prevent slow scans Fixes #122 The `leann list` command was scanning entire directory trees using `rglob()`, causing extremely slow performance when run in large directories like $HOME. Changes: - Add `_find_meta_files_limited()` method with max_depth parameter - Skip common large directories (node_modules, .venv, .git, etc.) - Apply limited search in `_discover_indexes_in_project()` and `_find_all_matching_indexes()` - Add `_has_app_indexes_limited()` in registry.py for faster checks - Add comprehensive tests for the new functionality Signed-off-by: majiayu000 <1835304752@qq.com> --- packages/leann-core/src/leann/cli.py | 153 ++++++++++------ packages/leann-core/src/leann/registry.py | 60 +++++- tests/test_cli_list_performance.py | 212 ++++++++++++++++++++++ 3 files changed, 371 insertions(+), 54 deletions(-) create mode 100644 tests/test_cli_list_performance.py diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 708892ab..139b4713 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -404,6 +404,68 @@ def _should_exclude_file(self, file_path: Path, gitignore_matches) -> bool: absolute_path = Path(str(file_path)) return gitignore_matches(absolute_path.as_posix()) + def _find_meta_files_limited( + self, root: Path, max_depth: int = 3, pattern: str = "*.leann.meta.json" + ): + """Find meta files with limited depth to avoid scanning large directories. + + Skips common large directories that shouldn't contain LEANN indexes. 
+ """ + # Directories to skip - these are typically large and won't contain user indexes + skip_dirs = { + ".git", + ".svn", + ".hg", + "node_modules", + "__pycache__", + ".venv", + "venv", + ".env", + "env", + ".tox", + ".nox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + ".eggs", + "*.egg-info", + ".cache", + ".npm", + ".yarn", + "vendor", + "Pods", + ".gradle", + "target", + } + + def should_skip(dir_name: str) -> bool: + """Check if directory should be skipped.""" + if dir_name in skip_dirs: + return True + # Skip hidden directories (except .leann which we want) + if dir_name.startswith(".") and dir_name != ".leann": + return True + return False + + def search_dir(path: Path, current_depth: int): + """Recursively search with depth limit.""" + if current_depth > max_depth: + return + + try: + for item in path.iterdir(): + if item.is_file() and item.match(pattern): + yield item + elif item.is_dir() and not should_skip(item.name): + yield from search_dir(item, current_depth + 1) + except (PermissionError, OSError): + # Skip directories we can't read + pass + + yield from search_dir(root, 0) + def _is_git_submodule(self, path: Path) -> bool: """Check if a path is a git submodule.""" try: @@ -583,9 +645,10 @@ def _discover_indexes_in_project( } ) - # 2. Apps format: *.leann.meta.json files anywhere in the project + # 2. 
Apps format: *.leann.meta.json files in the project + # Use limited-depth search to avoid scanning entire large directories cli_indexes_dir = project_path / ".leann" / "indexes" - for meta_file in project_path.rglob("*.leann.meta.json"): + for meta_file in self._find_meta_files_limited(project_path, max_depth=3): if meta_file.is_file(): # Skip CLI-built indexes (which store meta under .leann/indexes//) try: @@ -696,59 +759,43 @@ def _find_all_matching_indexes(self, index_name: str): # b) by the parent directory name (e.g., `new_txt`) seen_app_meta = set() - # 2a) by file base - for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"): - if meta_file.is_file(): - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in seen_app_meta: - continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + # Use limited-depth search to avoid scanning large directories + for meta_file in self._find_meta_files_limited(project_path, max_depth=3): + if not meta_file.is_file(): + continue - # 2b) by parent directory name - for meta_file in project_path.rglob("*.leann.meta.json"): - if meta_file.is_file() and meta_file.parent.name == index_name: - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in 
seen_app_meta: + # Skip CLI-built indexes' meta under .leann/indexes + try: + cli_indexes_dir = project_path / ".leann" / "indexes" + if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + except Exception: + pass + + file_base = meta_file.name.replace(".leann.meta.json", "") + parent_name = meta_file.parent.name + + # Check if this matches the requested index_name + # Match by file base or by parent directory name + if file_base != index_name and parent_name != index_name: + continue + + is_current = project_path == current_path + key = (str(project_path), str(meta_file)) + if key in seen_app_meta: + continue + seen_app_meta.add(key) + matches.append( + { + "project_path": project_path, + "files_dir": meta_file.parent, + "meta_file": meta_file, + "is_current": is_current, + "kind": "app", + "display_name": parent_name, + "file_base": file_base, + } + ) # Sort: current project first, then by project name matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name)) diff --git a/packages/leann-core/src/leann/registry.py b/packages/leann-core/src/leann/registry.py index d4a559a5..bf6e6c2d 100644 --- a/packages/leann-core/src/leann/registry.py +++ b/packages/leann-core/src/leann/registry.py @@ -49,6 +49,63 @@ def autodiscover_backends(): # print("INFO: Backend auto-discovery finished.") +def _has_app_indexes_limited(root: Path, max_depth: int = 3) -> bool: + """Check if directory contains app-format indexes with limited depth search. + + Skips common large directories that shouldn't contain LEANN indexes. 
+ """ + skip_dirs = { + ".git", + ".svn", + ".hg", + "node_modules", + "__pycache__", + ".venv", + "venv", + ".env", + "env", + ".tox", + ".nox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + ".eggs", + ".cache", + ".npm", + ".yarn", + "vendor", + "Pods", + ".gradle", + "target", + } + + def should_skip(dir_name: str) -> bool: + if dir_name in skip_dirs: + return True + if dir_name.startswith(".") and dir_name != ".leann": + return True + return False + + def search_dir(path: Path, current_depth: int) -> bool: + if current_depth > max_depth: + return False + + try: + for item in path.iterdir(): + if item.is_file() and item.name.endswith(".leann.meta.json"): + return True + elif item.is_dir() and not should_skip(item.name): + if search_dir(item, current_depth + 1): + return True + except (PermissionError, OSError): + pass + return False + + return search_dir(root, 0) + + def register_project_directory(project_dir: Optional[Union[str, Path]] = None): """ Register a project directory in the global LEANN registry. @@ -65,8 +122,9 @@ def register_project_directory(project_dir: Optional[Union[str, Path]] = None): # Only register directories that have some kind of LEANN content # Either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format) + # Use limited-depth search to avoid scanning large directories like $HOME has_cli_indexes = (project_dir / ".leann" / "indexes").exists() - has_app_indexes = any(project_dir.rglob("*.leann.meta.json")) + has_app_indexes = _has_app_indexes_limited(project_dir, max_depth=3) if not (has_cli_indexes or has_app_indexes): # Don't register if there are no LEANN indexes diff --git a/tests/test_cli_list_performance.py b/tests/test_cli_list_performance.py new file mode 100644 index 00000000..5e53864d --- /dev/null +++ b/tests/test_cli_list_performance.py @@ -0,0 +1,212 @@ +"""Tests for leann list command performance improvements. 
+ +This module tests the limited-depth search functionality that prevents +leann list from scanning all files in large directories like $HOME. +See: https://github.com/yichuan-w/LEANN/issues/122 +""" + +from pathlib import Path + + +class TestLimitedDepthSearch: + """Test the _find_meta_files_limited method for performance.""" + + def test_find_meta_files_respects_max_depth(self, tmp_path: Path): + """Meta files beyond max_depth should not be found.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a deep directory structure + # depth 0: tmp_path + # depth 1: level1 + # depth 2: level2 + # depth 3: level3 + # depth 4: level4 (beyond default max_depth=3) + level1 = tmp_path / "level1" + level2 = level1 / "level2" + level3 = level2 / "level3" + level4 = level3 / "level4" + + level4.mkdir(parents=True) + + # Create meta files at different depths + (tmp_path / "root.leann.meta.json").touch() + (level1 / "l1.leann.meta.json").touch() + (level2 / "l2.leann.meta.json").touch() + (level3 / "l3.leann.meta.json").touch() + (level4 / "l4.leann.meta.json").touch() + + # Find with max_depth=3 (should find root, l1, l2, l3 but not l4) + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "root.leann.meta.json" in found_names + assert "l1.leann.meta.json" in found_names + assert "l2.leann.meta.json" in found_names + assert "l3.leann.meta.json" in found_names + assert "l4.leann.meta.json" not in found_names + + def test_find_meta_files_skips_node_modules(self, tmp_path: Path): + """Meta files inside node_modules should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a meta file inside node_modules + node_modules = tmp_path / "node_modules" + node_modules.mkdir() + (node_modules / "pkg.leann.meta.json").touch() + + # Create a normal meta file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + 
found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "pkg.leann.meta.json" not in found_names + + def test_find_meta_files_skips_hidden_dirs(self, tmp_path: Path): + """Meta files inside hidden directories (except .leann) should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in hidden directories + hidden = tmp_path / ".hidden" + hidden.mkdir() + (hidden / "hidden.leann.meta.json").touch() + + # .leann should NOT be skipped + leann_dir = tmp_path / ".leann" + leann_dir.mkdir() + (leann_dir / "leann.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "leann.leann.meta.json" in found_names + assert "hidden.leann.meta.json" not in found_names + + def test_find_meta_files_skips_venv(self, tmp_path: Path): + """Meta files inside .venv and venv should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in virtual env directories + for venv_name in [".venv", "venv", ".env", "env"]: + venv_dir = tmp_path / venv_name + venv_dir.mkdir() + (venv_dir / f"{venv_name}.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert ".venv.leann.meta.json" not in found_names + assert "venv.leann.meta.json" not in found_names + assert ".env.leann.meta.json" not in found_names + assert "env.leann.meta.json" not in found_names + + def test_find_meta_files_skips_build_dirs(self, tmp_path: Path): + """Meta files inside build/dist directories should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in build 
directories + for build_name in ["build", "dist", "__pycache__", ".cache"]: + build_dir = tmp_path / build_name + build_dir.mkdir() + (build_dir / f"{build_name}.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "build.leann.meta.json" not in found_names + assert "dist.leann.meta.json" not in found_names + assert "__pycache__.leann.meta.json" not in found_names + assert ".cache.leann.meta.json" not in found_names + + +class TestRegistryLimitedSearch: + """Test the registry limited search functionality.""" + + def test_has_app_indexes_limited_respects_depth(self, tmp_path: Path): + """Should not find indexes beyond max_depth.""" + from leann.registry import _has_app_indexes_limited + + # Create a deep directory structure + level4 = tmp_path / "l1" / "l2" / "l3" / "l4" + level4.mkdir(parents=True) + + # Only create a file beyond depth 3 + (level4 / "deep.leann.meta.json").touch() + + # Should not find it with max_depth=3 + assert not _has_app_indexes_limited(tmp_path, max_depth=3) + + # Create one at depth 2 + (tmp_path / "l1" / "l2" / "shallow.leann.meta.json").touch() + + # Now should find it + assert _has_app_indexes_limited(tmp_path, max_depth=3) + + def test_has_app_indexes_limited_skips_node_modules(self, tmp_path: Path): + """Should skip node_modules directory.""" + from leann.registry import _has_app_indexes_limited + + # Create a meta file inside node_modules + node_modules = tmp_path / "node_modules" + node_modules.mkdir() + (node_modules / "pkg.leann.meta.json").touch() + + # Should not find it + assert not _has_app_indexes_limited(tmp_path, max_depth=3) + + # Create a normal meta file + (tmp_path / "normal.leann.meta.json").touch() + + # Now should find it + assert _has_app_indexes_limited(tmp_path, max_depth=3) + + +class 
TestDiscoverIndexesPerformance: + """Test that _discover_indexes_in_project uses limited search.""" + + def test_discover_indexes_skips_deep_directories(self, tmp_path: Path): + """Should not scan directories beyond max_depth.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a CLI format index (should always be found) + cli_indexes = tmp_path / ".leann" / "indexes" / "my-index" + cli_indexes.mkdir(parents=True) + (cli_indexes / "documents.leann.meta.json").touch() + + # Create an app format index at depth 4 (should not be found) + deep_dir = tmp_path / "a" / "b" / "c" / "d" + deep_dir.mkdir(parents=True) + (deep_dir / "deep.leann.meta.json").touch() + + indexes = cli._discover_indexes_in_project(tmp_path) + + # Should find the CLI index + assert any(idx["name"] == "my-index" for idx in indexes) + + # Should NOT find the deep app index + assert not any(idx["name"] == "d" for idx in indexes) From ba9164482af9a5c4283d303573865411d86adb73 Mon Sep 17 00:00:00 2001 From: majiayu000 <1835304752@qq.com> Date: Thu, 25 Dec 2025 19:01:48 +0800 Subject: [PATCH 2/3] feat: add --max-depth CLI option for leann list command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address reviewer feedback by making the directory scan depth configurable instead of hardcoding it to 3. Users with deeply nested project structures can now increase the depth limit as needed. 
- Add --max-depth argument to list command (default: 3) - Update list_indexes() and _discover_indexes_in_project() to accept max_depth - Add tests for the new CLI option and custom depth behavior 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- packages/leann-core/src/leann/cli.py | 41 +++++++++++++++++++-------- tests/test_cli_list_performance.py | 42 ++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 139b4713..b1365684 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -350,7 +350,14 @@ def create_parser(self) -> argparse.ArgumentParser: ) # List command - subparsers.add_parser("list", help="List all indexes") + list_parser = subparsers.add_parser("list", help="List all indexes") + list_parser.add_argument( + "--max-depth", + type=int, + default=3, + help="Maximum directory depth to scan for indexes (default: 3). " + "Increase if your indexes are in deeply nested directories.", + ) # Remove command remove_parser = subparsers.add_parser("remove", help="Remove an index") @@ -492,7 +499,13 @@ def _is_git_submodule(self, path: Path) -> bool: # If anything goes wrong, assume it's not a submodule return False - def list_indexes(self): + def list_indexes(self, max_depth: int = 3): + """List all LEANN indexes across registered projects. + + Args: + max_depth: Maximum directory depth to scan for app-format indexes. + Default is 3. Increase if indexes are in deeply nested directories. 
+ """ # Get all project directories with .leann global_registry = Path.home() / ".leann" / "projects.json" all_projects = [] @@ -537,7 +550,7 @@ def list_indexes(self): print(" " + "─" * 45) current_indexes = self._discover_indexes_in_project( - current_path, exclude_dirs=other_projects + current_path, exclude_dirs=other_projects, max_depth=max_depth ) if current_indexes: for idx in current_indexes: @@ -556,7 +569,7 @@ def list_indexes(self): print(" " + "─" * 45) for project_path in other_projects: - project_indexes = self._discover_indexes_in_project(project_path) + project_indexes = self._discover_indexes_in_project(project_path, max_depth=max_depth) if not project_indexes: continue @@ -580,9 +593,9 @@ def list_indexes(self): projects_count = 0 for p in valid_projects: if p == current_path: - discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects) + discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects, max_depth=max_depth) else: - discovered = self._discover_indexes_in_project(p) + discovered = self._discover_indexes_in_project(p, max_depth=max_depth) if len(discovered) > 0: projects_count += 1 print(f"šŸ“Š Total: {total_indexes} indexes across {projects_count} projects") @@ -602,13 +615,17 @@ def list_indexes(self): print(" leann build my-docs --docs ./documents") def _discover_indexes_in_project( - self, project_path: Path, exclude_dirs: Optional[list[Path]] = None + self, project_path: Path, exclude_dirs: Optional[list[Path]] = None, max_depth: int = 3 ): """Discover all indexes in a project directory (both CLI and apps formats) - exclude_dirs: when provided, skip any APP-format index files that are - located under these directories. This prevents duplicates when the - current project is a parent directory of other registered projects. + Args: + project_path: The project directory to search. + exclude_dirs: When provided, skip any APP-format index files that are + located under these directories. 
This prevents duplicates when the + current project is a parent directory of other registered projects. + max_depth: Maximum directory depth to scan for app-format indexes. + Default is 3. Increase if indexes are in deeply nested directories. """ indexes = [] exclude_dirs = exclude_dirs or [] @@ -648,7 +665,7 @@ def _discover_indexes_in_project( # 2. Apps format: *.leann.meta.json files in the project # Use limited-depth search to avoid scanning entire large directories cli_indexes_dir = project_path / ".leann" / "indexes" - for meta_file in self._find_meta_files_limited(project_path, max_depth=3): + for meta_file in self._find_meta_files_limited(project_path, max_depth=max_depth): if meta_file.is_file(): # Skip CLI-built indexes (which store meta under .leann/indexes//) try: @@ -1726,7 +1743,7 @@ async def run(self, args=None): return if args.command == "list": - self.list_indexes() + self.list_indexes(max_depth=args.max_depth) elif args.command == "remove": self.remove_index(args.index_name, args.force) elif args.command == "build": diff --git a/tests/test_cli_list_performance.py b/tests/test_cli_list_performance.py index 5e53864d..01b9b430 100644 --- a/tests/test_cli_list_performance.py +++ b/tests/test_cli_list_performance.py @@ -210,3 +210,45 @@ def test_discover_indexes_skips_deep_directories(self, tmp_path: Path): # Should NOT find the deep app index assert not any(idx["name"] == "d" for idx in indexes) + + def test_discover_indexes_respects_custom_max_depth(self, tmp_path: Path): + """Should find deeper indexes when max_depth is increased.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create an app format index at depth 5 + deep_dir = tmp_path / "a" / "b" / "c" / "d" / "e" + deep_dir.mkdir(parents=True) + (deep_dir / "deep.leann.meta.json").touch() + + # With default max_depth=3, should NOT find it + indexes_shallow = cli._discover_indexes_in_project(tmp_path, max_depth=3) + assert not any(idx["name"] == "e" for idx in indexes_shallow) + + 
# With max_depth=5, should find it + indexes_deep = cli._discover_indexes_in_project(tmp_path, max_depth=5) + assert any(idx["name"] == "e" for idx in indexes_deep) + + +class TestMaxDepthCliOption: + """Test the --max-depth CLI option for leann list.""" + + def test_max_depth_argument_is_parsed(self): + """The --max-depth argument should be properly parsed.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + parser = cli.create_parser() + + # Test default value + args = parser.parse_args(["list"]) + assert args.max_depth == 3 + + # Test custom value + args = parser.parse_args(["list", "--max-depth", "5"]) + assert args.max_depth == 5 + + # Test another custom value + args = parser.parse_args(["list", "--max-depth", "10"]) + assert args.max_depth == 10 From 9efaecea72247be3a3dbfc5aa364de3d0271a43f Mon Sep 17 00:00:00 2001 From: majiayu000 <1835304752@qq.com> Date: Thu, 25 Dec 2025 19:40:27 +0800 Subject: [PATCH 3/3] feat: implement global index registry for O(1) index discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements a centralized index registry at ~/.leann/indexes.json that stores all LEANN index paths, enabling O(1) lookup instead of directory scanning. 
Changes: - Add register_index/unregister_index/list_registered_indexes functions - Update leann build to register indexes in global registry - Update leann list to use registry when available (with scan fallback) - Update leann remove to unregister indexes from registry - Auto-cleanup stale registry entries on list - Add comprehensive tests for registry functionality When registry is used, `leann list` output shows: ⚡ Using global registry (O(1) lookup) When falling back to scan: 🔍 Using directory scan (run 'leann build' to enable fast registry) Closes #198 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- packages/leann-core/src/leann/cli.py | 164 +++++++++++++++++++++- packages/leann-core/src/leann/registry.py | 148 ++++++++++++++++++- tests/test_cli_list_performance.py | 128 +++++++++++++++++ 3 files changed, 433 insertions(+), 7 deletions(-) diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index b1365684..e83c6e63 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -10,7 +10,12 @@ from .api import LeannBuilder, LeannChat, LeannSearcher from .interactive_utils import create_cli_session -from .registry import register_project_directory +from .registry import ( + list_registered_indexes, + register_index, + register_project_directory, + unregister_index, +) from .settings import ( resolve_anthropic_base_url, resolve_ollama_host, @@ -502,10 +507,137 @@ def _is_git_submodule(self, path: Path) -> bool: def list_indexes(self, max_depth: int = 3): """List all LEANN indexes across registered projects. + Uses the global index registry for O(1) lookup when available. + Falls back to directory scanning for legacy indexes not yet registered. + Args: max_depth: Maximum directory depth to scan for app-format indexes. Default is 3. Increase if indexes are in deeply nested directories. 
""" + current_path = Path.cwd() + + # Try to use global index registry first (O(1) lookup) + registered_indexes = list_registered_indexes(validate=True) + + print("šŸ“š LEANN Indexes") + print("=" * 50) + + if registered_indexes: + # Use the fast path - global registry + self._list_indexes_from_registry(registered_indexes, current_path) + else: + # Fall back to directory scanning for legacy support + self._list_indexes_by_scanning(current_path, max_depth) + + def _list_indexes_from_registry(self, registered_indexes: list, current_path: Path): + """List indexes using the global registry (O(1) lookup).""" + # Group indexes by project + current_indexes = [] + other_indexes_by_project: dict[str, list] = {} + + for idx in registered_indexes: + idx_path = Path(idx["path"]) + # Determine which project this index belongs to + # CLI indexes: /path/to/project/.leann/indexes/name/documents.leann + # App indexes: /path/to/project/somewhere/file.leann + try: + if ".leann/indexes" in idx["path"]: + # CLI format - project is 3 levels up from .leann + project_path = idx_path.parent.parent.parent.parent + else: + # App format - use parent directory + project_path = idx_path.parent + except Exception: + project_path = idx_path.parent + + # Calculate size + size_mb = 0 + try: + meta_path = Path(idx["path"] + ".meta.json") + if meta_path.exists(): + index_dir = meta_path.parent + for f in index_dir.glob(f"{meta_path.stem.replace('.meta', '')}*"): + if f.is_file(): + size_mb += f.stat().st_size / (1024 * 1024) + except (OSError, PermissionError): + pass + + index_info = { + "name": idx["name"], + "type": idx["index_type"], + "status": "āœ…", + "size_mb": size_mb, + "path": idx["path"], + "project_path": project_path, + } + + # Check if this is in current project + try: + if project_path.resolve() == current_path.resolve(): + current_indexes.append(index_info) + else: + project_key = str(project_path) + if project_key not in other_indexes_by_project: + 
other_indexes_by_project[project_key] = [] + other_indexes_by_project[project_key].append(index_info) + except Exception: + # If comparison fails, treat as other project + project_key = str(project_path) + if project_key not in other_indexes_by_project: + other_indexes_by_project[project_key] = [] + other_indexes_by_project[project_key].append(index_info) + + total_indexes = len(registered_indexes) + current_indexes_count = len(current_indexes) + + # Show current project first + print("\nšŸ  Current Project") + print(f" {current_path}") + print(" " + "─" * 45) + + if current_indexes: + for i, idx in enumerate(current_indexes, 1): + type_icon = "šŸ“" if idx["type"] == "cli" else "šŸ“„" + print(f" {i}. {type_icon} {idx['name']} {idx['status']}") + if idx["size_mb"] > 0: + print(f" šŸ“¦ Size: {idx['size_mb']:.1f} MB") + else: + print(" šŸ“­ No indexes in current project") + + # Show other projects + if other_indexes_by_project: + print("\n\nšŸ—‚ļø Other Projects") + print(" " + "─" * 45) + + for project_key, indexes in other_indexes_by_project.items(): + project_path = Path(project_key) + print(f"\n šŸ“‚ {project_path.name}") + print(f" {project_path}") + + for idx in indexes: + type_icon = "šŸ“" if idx["type"] == "cli" else "šŸ“„" + print(f" • {type_icon} {idx['name']} {idx['status']}") + if idx["size_mb"] > 0: + print(f" šŸ“¦ {idx['size_mb']:.1f} MB") + + # Summary + print("\n" + "=" * 50) + projects_count = 1 if current_indexes else 0 + projects_count += len(other_indexes_by_project) + print(f"šŸ“Š Total: {total_indexes} indexes across {projects_count} projects") + print("⚔ Using global registry (O(1) lookup)") + + if current_indexes_count > 0: + print("\nšŸ’« Quick start (current project):") + example_name = current_indexes[0]["name"] + print(f' leann search {example_name} "your query"') + print(f" leann ask {example_name} --interactive") + else: + print("\nšŸ’” Create your first index:") + print(" leann build my-docs --docs ./documents") + + def 
_list_indexes_by_scanning(self, current_path: Path, max_depth: int): + """List indexes by scanning directories (legacy fallback).""" # Get all project directories with .leann global_registry = Path.home() / ".leann" / "projects.json" all_projects = [] @@ -527,7 +659,6 @@ def list_indexes(self, max_depth: int = 3): valid_projects.append(project_path) # Add current project if it has .leann but not in registry - current_path = Path.cwd() if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects: valid_projects.append(current_path) @@ -538,9 +669,6 @@ def list_indexes(self, max_depth: int = 3): if project_path != current_path: other_projects.append(project_path) - print("šŸ“š LEANN Indexes") - print("=" * 50) - total_indexes = 0 current_indexes_count = 0 @@ -599,6 +727,7 @@ def list_indexes(self, max_depth: int = 3): if len(discovered) > 0: projects_count += 1 print(f"šŸ“Š Total: {total_indexes} indexes across {projects_count} projects") + print("šŸ” Using directory scan (run 'leann build' to enable fast registry)") if current_indexes_count > 0: print("\nšŸ’« Quick start (current project):") @@ -974,11 +1103,21 @@ def _delete_index_directory( ): """Delete a CLI index directory or APP index files safely.""" try: + # Determine index path for unregistering from global registry + index_path_for_registry = None + if is_app: removed = 0 errors = 0 # Delete only files that belong to this app index (based on file base) pattern_base = app_file_base or "" + + # Find the .leann file path for unregistering + for f in index_dir.glob(f"{pattern_base}.leann"): + if f.is_file() and not f.name.endswith(".meta.json"): + index_path_for_registry = str(f) + break + for f in index_dir.glob(f"{pattern_base}.leann*"): try: f.unlink() @@ -994,6 +1133,10 @@ def _delete_index_directory( errors += 1 if removed > 0 and errors == 0: + # Unregister from global registry + if index_path_for_registry: + unregister_index(index_path_for_registry) + if project_path: print( 
f"āœ… App index '{index_display_name}' removed from {project_path.name}" @@ -1014,8 +1157,14 @@ def _delete_index_directory( else: import shutil + # For CLI indexes, the path is index_dir / "documents.leann" + index_path_for_registry = str(index_dir / "documents.leann") + shutil.rmtree(index_dir) + # Unregister from global registry + unregister_index(index_path_for_registry) + if project_path: print(f"āœ… Index '{index_display_name}' removed from {project_path.name}") else: @@ -1521,7 +1670,10 @@ async def build_index(self, args): builder.build_index(index_path) print(f"Index built at {index_path}") - # Register this project directory in global registry + # Register this index in global registry for O(1) discovery + register_index(name=index_name, path=index_path, index_type="cli") + + # Register this project directory in global registry (legacy support) self.register_project_dir() async def search_documents(self, args): diff --git a/packages/leann-core/src/leann/registry.py b/packages/leann-core/src/leann/registry.py index bf6e6c2d..0d267ec8 100644 --- a/packages/leann-core/src/leann/registry.py +++ b/packages/leann-core/src/leann/registry.py @@ -4,8 +4,9 @@ import importlib.metadata import json import logging +from datetime import datetime, timezone from pathlib import Path -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional, TypedDict, Union if TYPE_CHECKING: from leann.interface import LeannBackendFactoryInterface @@ -13,6 +14,151 @@ # Set up logger for this module logger = logging.getLogger(__name__) + +# Global index registry path +GLOBAL_INDEX_REGISTRY_PATH = Path.home() / ".leann" / "indexes.json" + + +class IndexEntry(TypedDict): + """Schema for a registered index entry.""" + + name: str + path: str + index_type: str # "cli" or "app" + created_at: str # ISO format datetime + + +def _load_index_registry() -> list[IndexEntry]: + """Load the global index registry from disk.""" + if not 
GLOBAL_INDEX_REGISTRY_PATH.exists(): + return [] + try: + with open(GLOBAL_INDEX_REGISTRY_PATH) as f: + data = json.load(f) + return data.get("indexes", []) + except Exception as e: + logger.debug(f"Could not load index registry: {e}") + return [] + + +def _save_index_registry(indexes: list[IndexEntry]) -> bool: + """Save the global index registry to disk.""" + try: + GLOBAL_INDEX_REGISTRY_PATH.parent.mkdir(parents=True, exist_ok=True) + with open(GLOBAL_INDEX_REGISTRY_PATH, "w") as f: + json.dump({"indexes": indexes}, f, indent=2) + return True + except Exception as e: + logger.warning(f"Could not save index registry: {e}") + return False + + +def register_index( + name: str, + path: Union[str, Path], + index_type: str = "cli", +) -> bool: + """Register an index in the global registry. + + Args: + name: Display name of the index. + path: Path to the index file (e.g., /path/to/.leann/indexes/my-index/documents.leann). + index_type: Type of index - "cli" or "app". + + Returns: + True if registration succeeded, False otherwise. + """ + path_str = str(Path(path).resolve()) + + indexes = _load_index_registry() + + # Check if already registered (by path) + for idx in indexes: + if idx["path"] == path_str: + # Update existing entry + idx["name"] = name + idx["index_type"] = index_type + return _save_index_registry(indexes) + + # Add new entry + entry: IndexEntry = { + "name": name, + "path": path_str, + "index_type": index_type, + "created_at": datetime.now(timezone.utc).isoformat(), + } + indexes.append(entry) + return _save_index_registry(indexes) + + +def unregister_index(path: Union[str, Path]) -> bool: + """Remove an index from the global registry. + + Args: + path: Path to the index file. + + Returns: + True if unregistration succeeded, False otherwise. 
+ """ + path_str = str(Path(path).resolve()) + indexes = _load_index_registry() + + original_count = len(indexes) + indexes = [idx for idx in indexes if idx["path"] != path_str] + + if len(indexes) < original_count: + return _save_index_registry(indexes) + return True # Nothing to remove is still success + + +def list_registered_indexes(validate: bool = True) -> list[IndexEntry]: + """Get all registered indexes from the global registry. + + Args: + validate: If True, removes entries whose paths no longer exist. + + Returns: + List of registered index entries. + """ + indexes = _load_index_registry() + + if validate: + valid_indexes = [] + for idx in indexes: + # Check if the meta file exists + meta_path = Path(idx["path"] + ".meta.json") + if meta_path.exists(): + valid_indexes.append(idx) + else: + logger.debug(f"Removing stale index entry: {idx['path']}") + + if len(valid_indexes) < len(indexes): + _save_index_registry(valid_indexes) + return valid_indexes + + return indexes + + +def cleanup_stale_indexes() -> int: + """Remove registry entries for indexes that no longer exist. + + Returns: + Number of stale entries removed. 
+ """ + indexes = _load_index_registry() + original_count = len(indexes) + + valid_indexes = [] + for idx in indexes: + meta_path = Path(idx["path"] + ".meta.json") + if meta_path.exists(): + valid_indexes.append(idx) + + if len(valid_indexes) < original_count: + _save_index_registry(valid_indexes) + + return original_count - len(valid_indexes) + BACKEND_REGISTRY: dict[str, "LeannBackendFactoryInterface"] = {} diff --git a/tests/test_cli_list_performance.py b/tests/test_cli_list_performance.py index 01b9b430..1cdd7ab8 100644 --- a/tests/test_cli_list_performance.py +++ b/tests/test_cli_list_performance.py @@ -5,7 +5,9 @@ See: https://github.com/yichuan-w/LEANN/issues/122 """ +import json from pathlib import Path +from unittest.mock import patch class TestLimitedDepthSearch: @@ -252,3 +254,129 @@ def test_max_depth_argument_is_parsed(self): # Test another custom value args = parser.parse_args(["list", "--max-depth", "10"]) assert args.max_depth == 10 + + +class TestGlobalIndexRegistry: + """Test the global index registry for O(1) index discovery.""" + + def test_register_and_list_index(self, tmp_path: Path): + """Should register an index and list it from the registry.""" + from leann.registry import ( + GLOBAL_INDEX_REGISTRY_PATH, + _load_index_registry, + _save_index_registry, + register_index, + list_registered_indexes, + unregister_index, + ) + + # Use a temporary registry file + test_registry = tmp_path / "indexes.json" + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + # Register an index + index_path = tmp_path / ".leann" / "indexes" / "test-index" / "documents.leann" + index_path.parent.mkdir(parents=True) + index_path.touch() + (index_path.parent / "documents.leann.meta.json").touch() + + result = register_index( + name="test-index", + path=str(index_path), + index_type="cli", + ) + assert result is True + + # List indexes + indexes = list_registered_indexes(validate=True) + assert len(indexes) == 1 + assert indexes[0]["name"] == 
"test-index" + assert indexes[0]["index_type"] == "cli" + + # Unregister + result = unregister_index(str(index_path)) + assert result is True + + # Should be empty now + indexes = list_registered_indexes(validate=False) + assert len(indexes) == 0 + + def test_registry_validates_stale_entries(self, tmp_path: Path): + """Should remove entries for indexes that no longer exist.""" + from leann.registry import ( + _save_index_registry, + list_registered_indexes, + ) + + test_registry = tmp_path / "indexes.json" + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + # Create a registry with a stale entry + stale_entry = { + "name": "stale-index", + "path": str(tmp_path / "nonexistent" / "documents.leann"), + "index_type": "cli", + "created_at": "2024-01-01T00:00:00+00:00", + } + _save_index_registry([stale_entry]) + + # List with validation should remove the stale entry + indexes = list_registered_indexes(validate=True) + assert len(indexes) == 0 + + def test_register_index_updates_existing(self, tmp_path: Path): + """Should update an existing entry instead of duplicating.""" + from leann.registry import ( + register_index, + list_registered_indexes, + ) + + test_registry = tmp_path / "indexes.json" + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + # Create the index files + index_path = tmp_path / "test.leann" + index_path.touch() + (tmp_path / "test.leann.meta.json").touch() + + # Register twice with different names + register_index(name="first-name", path=str(index_path), index_type="app") + register_index(name="second-name", path=str(index_path), index_type="app") + + # Should only have one entry with the updated name + indexes = list_registered_indexes(validate=True) + assert len(indexes) == 1 + assert indexes[0]["name"] == "second-name" + + +class TestListIndexesWithRegistry: + """Test that list_indexes uses the global registry when available.""" + + def test_list_indexes_uses_registry_when_available(self, tmp_path: 
Path, capsys): + """Should use O(1) registry lookup when indexes are registered.""" + from leann.cli import LeannCLI + from leann.registry import register_index + + test_registry = tmp_path / "indexes.json" + + # Create an index + index_dir = tmp_path / ".leann" / "indexes" / "my-index" + index_dir.mkdir(parents=True) + index_path = index_dir / "documents.leann" + index_path.touch() + (index_dir / "documents.leann.meta.json").touch() + + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + with patch("leann.cli.list_registered_indexes") as mock_list: + # Mock the registry to return our index + mock_list.return_value = [ + { + "name": "my-index", + "path": str(index_path), + "index_type": "cli", + "created_at": "2024-01-01T00:00:00+00:00", + } + ] + + cli = LeannCLI() + cli.list_indexes() + + captured = capsys.readouterr() + assert "O(1) lookup" in captured.out or "global registry" in captured.out.lower()