diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 708892ab..e83c6e63 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -10,7 +10,12 @@ from .api import LeannBuilder, LeannChat, LeannSearcher from .interactive_utils import create_cli_session -from .registry import register_project_directory +from .registry import ( + list_registered_indexes, + register_index, + register_project_directory, + unregister_index, +) from .settings import ( resolve_anthropic_base_url, resolve_ollama_host, @@ -350,7 +355,14 @@ def create_parser(self) -> argparse.ArgumentParser: ) # List command - subparsers.add_parser("list", help="List all indexes") + list_parser = subparsers.add_parser("list", help="List all indexes") + list_parser.add_argument( + "--max-depth", + type=int, + default=3, + help="Maximum directory depth to scan for indexes (default: 3). " + "Increase if your indexes are in deeply nested directories.", + ) # Remove command remove_parser = subparsers.add_parser("remove", help="Remove an index") @@ -404,6 +416,68 @@ def _should_exclude_file(self, file_path: Path, gitignore_matches) -> bool: absolute_path = Path(str(file_path)) return gitignore_matches(absolute_path.as_posix()) + def _find_meta_files_limited( + self, root: Path, max_depth: int = 3, pattern: str = "*.leann.meta.json" + ): + """Find meta files with limited depth to avoid scanning large directories. + + Skips common large directories that shouldn't contain LEANN indexes. + """ + # Directories to skip - these are typically large and won't contain user indexes + skip_dirs = { + ".git", + ".svn", + ".hg", + "node_modules", + "__pycache__", + ".venv", + "venv", + ".env", + "env", + ".tox", + ".nox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + ".eggs", + "*.egg-info", + ".cache", + ".npm", + ".yarn", + "vendor", + "Pods", + ".gradle", + "target", + } + + def should_skip(dir_name: str) -> bool: + """Check if directory should be skipped.""" + if dir_name in skip_dirs: + return True + # Skip hidden directories (except .leann which we want) + if dir_name.startswith(".") and dir_name != ".leann": + return True + return False + + def search_dir(path: Path, current_depth: int): + """Recursively search with depth limit.""" + if current_depth > max_depth: + return + + try: + for item in path.iterdir(): + if item.is_file() and item.match(pattern): + yield item + elif item.is_dir() and not should_skip(item.name): + yield from search_dir(item, current_depth + 1) + except (PermissionError, OSError): + # Skip directories we can't read + pass + + yield from search_dir(root, 0) + def _is_git_submodule(self, path: Path) -> bool: """Check if a path is a git submodule.""" try: @@ -430,7 +504,140 @@ def _is_git_submodule(self, path: Path) -> bool: # If anything goes wrong, assume it's not a submodule return False - def list_indexes(self): + def list_indexes(self, max_depth: int = 3): + """List all LEANN indexes across registered projects. + + Uses the global index registry for O(1) lookup when available. + Falls back to directory scanning for legacy indexes not yet registered. + + Args: + max_depth: Maximum directory depth to scan for app-format indexes. + Default is 3. Increase if indexes are in deeply nested directories. + """ + current_path = Path.cwd() + + # Try to use global index registry first (O(1) lookup) + registered_indexes = list_registered_indexes(validate=True) + + print("šŸ“š LEANN Indexes") + print("=" * 50) + + if registered_indexes: + # Use the fast path - global registry + self._list_indexes_from_registry(registered_indexes, current_path) + else: + # Fall back to directory scanning for legacy support + self._list_indexes_by_scanning(current_path, max_depth) + + def _list_indexes_from_registry(self, registered_indexes: list, current_path: Path): + """List indexes using the global registry (O(1) lookup).""" + # Group indexes by project + current_indexes = [] + other_indexes_by_project: dict[str, list] = {} + + for idx in registered_indexes: + idx_path = Path(idx["path"]) + # Determine which project this index belongs to + # CLI indexes: /path/to/project/.leann/indexes/name/documents.leann + # App indexes: /path/to/project/somewhere/file.leann + try: + if ".leann/indexes" in idx["path"]: + # CLI format - project is 3 levels up from .leann + project_path = idx_path.parent.parent.parent.parent + else: + # App format - use parent directory + project_path = idx_path.parent + except Exception: + project_path = idx_path.parent + + # Calculate size + size_mb = 0 + try: + meta_path = Path(idx["path"] + ".meta.json") + if meta_path.exists(): + index_dir = meta_path.parent + for f in index_dir.glob(f"{meta_path.stem.replace('.meta', '')}*"): + if f.is_file(): + size_mb += f.stat().st_size / (1024 * 1024) + except (OSError, PermissionError): + pass + + index_info = { + "name": idx["name"], + "type": idx["index_type"], + "status": "āœ…", + "size_mb": size_mb, + "path": idx["path"], + "project_path": project_path, + } + + # Check if this is in current project + try: + if project_path.resolve() == current_path.resolve(): + current_indexes.append(index_info) + else: + project_key = str(project_path) + if project_key not in other_indexes_by_project: + other_indexes_by_project[project_key] = [] + other_indexes_by_project[project_key].append(index_info) + except Exception: + # If comparison fails, treat as other project + project_key = str(project_path) + if project_key not in other_indexes_by_project: + other_indexes_by_project[project_key] = [] + other_indexes_by_project[project_key].append(index_info) + + total_indexes = len(registered_indexes) + current_indexes_count = len(current_indexes) + + # Show current project first + print("\nšŸ  Current Project") + print(f" {current_path}") + print(" " + "─" * 45) + + if current_indexes: + for i, idx in enumerate(current_indexes, 1): + type_icon = "šŸ“" if idx["type"] == "cli" else "šŸ“„" + print(f" {i}. {type_icon} {idx['name']} {idx['status']}") + if idx["size_mb"] > 0: + print(f" šŸ“¦ Size: {idx['size_mb']:.1f} MB") + else: + print(" šŸ“­ No indexes in current project") + + # Show other projects + if other_indexes_by_project: + print("\n\nšŸ—‚ļø Other Projects") + print(" " + "─" * 45) + + for project_key, indexes in other_indexes_by_project.items(): + project_path = Path(project_key) + print(f"\n šŸ“‚ {project_path.name}") + print(f" {project_path}") + + for idx in indexes: + type_icon = "šŸ“" if idx["type"] == "cli" else "šŸ“„" + print(f" • {type_icon} {idx['name']} {idx['status']}") + if idx["size_mb"] > 0: + print(f" šŸ“¦ {idx['size_mb']:.1f} MB") + + # Summary + print("\n" + "=" * 50) + projects_count = 1 if current_indexes else 0 + projects_count += len(other_indexes_by_project) + print(f"šŸ“Š Total: {total_indexes} indexes across {projects_count} projects") + print("⚔ Using global registry (O(1) lookup)") + + if current_indexes_count > 0: + print("\nšŸ’« Quick start (current project):") + example_name = current_indexes[0]["name"] + print(f' leann search {example_name} "your query"') + print(f" leann ask {example_name} --interactive") + else: + print("\nšŸ’” Create your first index:") + print(" leann build my-docs --docs ./documents") + + def _list_indexes_by_scanning(self, current_path: Path, max_depth: int): + """List indexes by scanning directories (legacy fallback).""" # Get all project directories with .leann global_registry = Path.home() / ".leann" / "projects.json" all_projects = [] @@ -452,7 +659,6 @@ def list_indexes(self): valid_projects.append(project_path) # Add current project if it has .leann but not in registry - current_path = Path.cwd() if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects: valid_projects.append(current_path) @@ -463,9 +669,6 @@ def list_indexes(self): if project_path != current_path: other_projects.append(project_path) - print("šŸ“š LEANN Indexes") - print("=" * 50) - total_indexes = 0 current_indexes_count = 0 @@ -475,7 +678,7 @@ def list_indexes(self): print(" " + "─" * 45) current_indexes = self._discover_indexes_in_project( - current_path, exclude_dirs=other_projects + current_path, exclude_dirs=other_projects, max_depth=max_depth ) if current_indexes: for idx in current_indexes: @@ -494,7 +697,7 @@ def list_indexes(self): print(" " + "─" * 45) for project_path in other_projects: - project_indexes = self._discover_indexes_in_project(project_path) + project_indexes = self._discover_indexes_in_project(project_path, max_depth=max_depth) if not project_indexes: continue @@ -518,12 +721,13 @@ def list_indexes(self): projects_count = 0 for p in valid_projects: if p == current_path: - discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects) + discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects, max_depth=max_depth) else: - discovered = self._discover_indexes_in_project(p) + discovered = self._discover_indexes_in_project(p, max_depth=max_depth) if len(discovered) > 0: projects_count += 1 print(f"šŸ“Š Total: {total_indexes} indexes across {projects_count} projects") + print("šŸ” Using directory scan (run 'leann build' to enable fast registry)") if current_indexes_count > 0: print("\nšŸ’« Quick start (current project):") @@ -540,13 +744,17 @@ def list_indexes(self): print(" leann build my-docs --docs ./documents") def _discover_indexes_in_project( - self, project_path: Path, exclude_dirs: Optional[list[Path]] = None + self, project_path: Path, exclude_dirs: Optional[list[Path]] = None, max_depth: int = 3 ): """Discover all indexes in a project directory (both CLI and apps formats) - exclude_dirs: when provided, skip any APP-format index files that are - located under these directories. This prevents duplicates when the - current project is a parent directory of other registered projects. + Args: + project_path: The project directory to search. + exclude_dirs: When provided, skip any APP-format index files that are + located under these directories. This prevents duplicates when the + current project is a parent directory of other registered projects. + max_depth: Maximum directory depth to scan for app-format indexes. + Default is 3. Increase if indexes are in deeply nested directories. """ indexes = [] exclude_dirs = exclude_dirs or [] @@ -583,9 +791,10 @@ def _discover_indexes_in_project( } ) - # 2. Apps format: *.leann.meta.json files anywhere in the project + # 2. Apps format: *.leann.meta.json files in the project + # Use limited-depth search to avoid scanning entire large directories cli_indexes_dir = project_path / ".leann" / "indexes" - for meta_file in project_path.rglob("*.leann.meta.json"): + for meta_file in self._find_meta_files_limited(project_path, max_depth=max_depth): if meta_file.is_file(): # Skip CLI-built indexes (which store meta under .leann/indexes//) try: @@ -696,59 +905,43 @@ def _find_all_matching_indexes(self, index_name: str): # b) by the parent directory name (e.g., `new_txt`) seen_app_meta = set() - # 2a) by file base - for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"): - if meta_file.is_file(): - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in seen_app_meta: - continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + # Use limited-depth search to avoid scanning large directories + for meta_file in self._find_meta_files_limited(project_path, max_depth=3): + if not meta_file.is_file(): + continue - # 2b) by parent directory name - for meta_file in project_path.rglob("*.leann.meta.json"): - if meta_file.is_file() and meta_file.parent.name == index_name: - # Skip CLI-built indexes' meta under .leann/indexes - try: - cli_indexes_dir = project_path / ".leann" / "indexes" - if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: - continue - except Exception: - pass - is_current = project_path == current_path - key = (str(project_path), str(meta_file)) - if key in seen_app_meta: + # Skip CLI-built indexes' meta under .leann/indexes + try: + cli_indexes_dir = project_path / ".leann" / "indexes" + if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: continue - seen_app_meta.add(key) - matches.append( - { - "project_path": project_path, - "files_dir": meta_file.parent, - "meta_file": meta_file, - "is_current": is_current, - "kind": "app", - "display_name": meta_file.parent.name, - "file_base": meta_file.name.replace(".leann.meta.json", ""), - } - ) + except Exception: + pass + + file_base = meta_file.name.replace(".leann.meta.json", "") + parent_name = meta_file.parent.name + + # Check if this matches the requested index_name + # Match by file base or by parent directory name + if file_base != index_name and parent_name != index_name: + continue + + is_current = project_path == current_path + key = (str(project_path), str(meta_file)) + if key in seen_app_meta: + continue + seen_app_meta.add(key) + matches.append( + { + "project_path": project_path, + "files_dir": meta_file.parent, + "meta_file": meta_file, + "is_current": is_current, + "kind": "app", + "display_name": parent_name, + "file_base": file_base, + } + ) # Sort: current project first, then by project name matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name)) @@ -910,11 +1103,21 @@ def _delete_index_directory( ): """Delete a CLI index directory or APP index files safely.""" try: + # Determine index path for unregistering from global registry + index_path_for_registry = None + if is_app: removed = 0 errors = 0 # Delete only files that belong to this app index (based on file base) pattern_base = app_file_base or "" + + # Find the .leann file path for unregistering + for f in index_dir.glob(f"{pattern_base}.leann"): + if f.is_file() and not f.name.endswith(".meta.json"): + index_path_for_registry = str(f) + break + for f in index_dir.glob(f"{pattern_base}.leann*"): try: f.unlink() @@ -930,6 +1133,10 @@ def _delete_index_directory( errors += 1 if removed > 0 and errors == 0: + # Unregister from global registry + if index_path_for_registry: + unregister_index(index_path_for_registry) + if project_path: print( f"āœ… App index '{index_display_name}' removed from {project_path.name}" @@ -950,8 +1157,14 @@ def _delete_index_directory( else: import shutil + # For CLI indexes, the path is index_dir / "documents.leann" + index_path_for_registry = str(index_dir / "documents.leann") + shutil.rmtree(index_dir) + # Unregister from global registry + unregister_index(index_path_for_registry) + if project_path: print(f"āœ… Index '{index_display_name}' removed from {project_path.name}") else: @@ -1457,7 +1670,10 @@ async def build_index(self, args): builder.build_index(index_path) print(f"Index built at {index_path}") - # Register this project directory in global registry + # Register this index in global registry for O(1) discovery + register_index(name=index_name, path=index_path, index_type="cli") + + # Register this project directory in global registry (legacy support) self.register_project_dir() async def search_documents(self, args): @@ -1679,7 +1895,7 @@ async def run(self, args=None): return if args.command == "list": - self.list_indexes() + self.list_indexes(max_depth=args.max_depth) elif args.command == "remove": self.remove_index(args.index_name, args.force) elif args.command == "build": diff --git a/packages/leann-core/src/leann/registry.py b/packages/leann-core/src/leann/registry.py index d4a559a5..0d267ec8 100644 --- a/packages/leann-core/src/leann/registry.py +++ b/packages/leann-core/src/leann/registry.py @@ -4,8 +4,9 @@ import importlib.metadata import json import logging +from datetime import datetime, timezone from pathlib import Path -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional, TypedDict, Union if TYPE_CHECKING: from leann.interface import LeannBackendFactoryInterface @@ -13,6 +14,151 @@ # Set up logger for this module logger = logging.getLogger(__name__) + +# Global index registry path +GLOBAL_INDEX_REGISTRY_PATH = Path.home() / ".leann" / "indexes.json" + + +class IndexEntry(TypedDict): + """Schema for a registered index entry.""" + + name: str + path: str + index_type: str # "cli" or "app" + created_at: str # ISO format datetime + + +def _load_index_registry() -> list[IndexEntry]: + """Load the global index registry from disk.""" + if not GLOBAL_INDEX_REGISTRY_PATH.exists(): + return [] + try: + with open(GLOBAL_INDEX_REGISTRY_PATH) as f: + data = json.load(f) + return data.get("indexes", []) + except Exception as e: + logger.debug(f"Could not load index registry: {e}") + return [] + + +def _save_index_registry(indexes: list[IndexEntry]) -> bool: + """Save the global index registry to disk.""" + try: + GLOBAL_INDEX_REGISTRY_PATH.parent.mkdir(parents=True, exist_ok=True) + with open(GLOBAL_INDEX_REGISTRY_PATH, "w") as f: + json.dump({"indexes": indexes}, f, indent=2) + return True + except Exception as e: + logger.warning(f"Could not save index registry: {e}") + return False + + +def register_index( + name: str, + path: Union[str, Path], + index_type: str = "cli", +) -> bool: + """Register an index in the global registry. + + Args: + name: Display name of the index. + path: Path to the index file (e.g., /path/to/.leann/indexes/my-index/documents.leann). + index_type: Type of index - "cli" or "app". + + Returns: + True if registration succeeded, False otherwise. + """ + path_str = str(Path(path).resolve()) + + indexes = _load_index_registry() + + # Check if already registered (by path) + for idx in indexes: + if idx["path"] == path_str: + # Update existing entry + idx["name"] = name + idx["index_type"] = index_type + return _save_index_registry(indexes) + + # Add new entry + entry: IndexEntry = { + "name": name, + "path": path_str, + "index_type": index_type, + "created_at": datetime.now(timezone.utc).isoformat(), + } + indexes.append(entry) + return _save_index_registry(indexes) + + +def unregister_index(path: Union[str, Path]) -> bool: + """Remove an index from the global registry. + + Args: + path: Path to the index file. + + Returns: + True if unregistration succeeded, False otherwise. + """ + path_str = str(Path(path).resolve()) + indexes = _load_index_registry() + + original_count = len(indexes) + indexes = [idx for idx in indexes if idx["path"] != path_str] + + if len(indexes) < original_count: + return _save_index_registry(indexes) + return True # Nothing to remove is still success + + +def list_registered_indexes(validate: bool = True) -> list[IndexEntry]: + """Get all registered indexes from the global registry. + + Args: + validate: If True, removes entries whose paths no longer exist. + + Returns: + List of registered index entries. + """ + indexes = _load_index_registry() + + if validate: + valid_indexes = [] + for idx in indexes: + # Check if the meta file exists + meta_path = Path(idx["path"] + ".meta.json") + if meta_path.exists(): + valid_indexes.append(idx) + else: + logger.debug(f"Removing stale index entry: {idx['path']}") + + if len(valid_indexes) < len(indexes): + _save_index_registry(valid_indexes) + return valid_indexes + + return indexes + + +def cleanup_stale_indexes() -> int: + """Remove registry entries for indexes that no longer exist. + + Returns: + Number of stale entries removed. + """ + indexes = _load_index_registry() + original_count = len(indexes) + + valid_indexes = [] + for idx in indexes: + meta_path = Path(idx["path"] + ".meta.json") + if meta_path.exists(): + valid_indexes.append(idx) + + if len(valid_indexes) < original_count: + _save_index_registry(valid_indexes) + + return original_count - len(valid_indexes) + BACKEND_REGISTRY: dict[str, "LeannBackendFactoryInterface"] = {} @@ -49,6 +195,63 @@ def autodiscover_backends(): # print("INFO: Backend auto-discovery finished.") +def _has_app_indexes_limited(root: Path, max_depth: int = 3) -> bool: + """Check if directory contains app-format indexes with limited depth search. + + Skips common large directories that shouldn't contain LEANN indexes. + """ + skip_dirs = { + ".git", + ".svn", + ".hg", + "node_modules", + "__pycache__", + ".venv", + "venv", + ".env", + "env", + ".tox", + ".nox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + ".eggs", + ".cache", + ".npm", + ".yarn", + "vendor", + "Pods", + ".gradle", + "target", + } + + def should_skip(dir_name: str) -> bool: + if dir_name in skip_dirs: + return True + if dir_name.startswith(".") and dir_name != ".leann": + return True + return False + + def search_dir(path: Path, current_depth: int) -> bool: + if current_depth > max_depth: + return False + + try: + for item in path.iterdir(): + if item.is_file() and item.name.endswith(".leann.meta.json"): + return True + elif item.is_dir() and not should_skip(item.name): + if search_dir(item, current_depth + 1): + return True + except (PermissionError, OSError): + pass + return False + + return search_dir(root, 0) + + def register_project_directory(project_dir: Optional[Union[str, Path]] = None): """ Register a project directory in the global LEANN registry. @@ -65,8 +268,9 @@ def register_project_directory(project_dir: Optional[Union[str, Path]] = None): # Only register directories that have some kind of LEANN content # Either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format) + # Use limited-depth search to avoid scanning large directories like $HOME has_cli_indexes = (project_dir / ".leann" / "indexes").exists() - has_app_indexes = any(project_dir.rglob("*.leann.meta.json")) + has_app_indexes = _has_app_indexes_limited(project_dir, max_depth=3) if not (has_cli_indexes or has_app_indexes): # Don't register if there are no LEANN indexes diff --git a/tests/test_cli_list_performance.py b/tests/test_cli_list_performance.py new file mode 100644 index 00000000..1cdd7ab8 --- /dev/null +++ b/tests/test_cli_list_performance.py @@ -0,0 +1,382 @@ +"""Tests for leann list command performance improvements. + +This module tests the limited-depth search functionality that prevents +leann list from scanning all files in large directories like $HOME. +See: https://github.com/yichuan-w/LEANN/issues/122 +""" + +import json +from pathlib import Path +from unittest.mock import patch + + +class TestLimitedDepthSearch: + """Test the _find_meta_files_limited method for performance.""" + + def test_find_meta_files_respects_max_depth(self, tmp_path: Path): + """Meta files beyond max_depth should not be found.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a deep directory structure + # depth 0: tmp_path + # depth 1: level1 + # depth 2: level2 + # depth 3: level3 + # depth 4: level4 (beyond default max_depth=3) + level1 = tmp_path / "level1" + level2 = level1 / "level2" + level3 = level2 / "level3" + level4 = level3 / "level4" + + level4.mkdir(parents=True) + + # Create meta files at different depths + (tmp_path / "root.leann.meta.json").touch() + (level1 / "l1.leann.meta.json").touch() + (level2 / "l2.leann.meta.json").touch() + (level3 / "l3.leann.meta.json").touch() + (level4 / "l4.leann.meta.json").touch() + + # Find with max_depth=3 (should find root, l1, l2, l3 but not l4) + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "root.leann.meta.json" in found_names + assert "l1.leann.meta.json" in found_names + assert "l2.leann.meta.json" in found_names + assert "l3.leann.meta.json" in found_names + assert "l4.leann.meta.json" not in found_names + + def test_find_meta_files_skips_node_modules(self, tmp_path: Path): + """Meta files inside node_modules should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a meta file inside node_modules + node_modules = tmp_path / "node_modules" + node_modules.mkdir() + (node_modules / "pkg.leann.meta.json").touch() + + # Create a normal meta file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "pkg.leann.meta.json" not in found_names + + def test_find_meta_files_skips_hidden_dirs(self, tmp_path: Path): + """Meta files inside hidden directories (except .leann) should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in hidden directories + hidden = tmp_path / ".hidden" + hidden.mkdir() + (hidden / "hidden.leann.meta.json").touch() + + # .leann should NOT be skipped + leann_dir = tmp_path / ".leann" + leann_dir.mkdir() + (leann_dir / "leann.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "leann.leann.meta.json" in found_names + assert "hidden.leann.meta.json" not in found_names + + def test_find_meta_files_skips_venv(self, tmp_path: Path): + """Meta files inside .venv and venv should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in virtual env directories + for venv_name in [".venv", "venv", ".env", "env"]: + venv_dir = tmp_path / venv_name + venv_dir.mkdir() + (venv_dir / f"{venv_name}.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert ".venv.leann.meta.json" not in found_names + assert "venv.leann.meta.json" not in found_names + assert ".env.leann.meta.json" not in found_names + assert "env.leann.meta.json" not in found_names + + def test_find_meta_files_skips_build_dirs(self, tmp_path: Path): + """Meta files inside build/dist directories should be skipped.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create meta files in build directories + for build_name in ["build", "dist", "__pycache__", ".cache"]: + build_dir = tmp_path / build_name + build_dir.mkdir() + (build_dir / f"{build_name}.leann.meta.json").touch() + + # Normal file + (tmp_path / "normal.leann.meta.json").touch() + + found = list(cli._find_meta_files_limited(tmp_path, max_depth=3)) + found_names = {f.name for f in found} + + assert "normal.leann.meta.json" in found_names + assert "build.leann.meta.json" not in found_names + assert "dist.leann.meta.json" not in found_names + assert "__pycache__.leann.meta.json" not in found_names + assert ".cache.leann.meta.json" not in found_names + + +class TestRegistryLimitedSearch: + """Test the registry limited search functionality.""" + + def test_has_app_indexes_limited_respects_depth(self, tmp_path: Path): + """Should not find indexes beyond max_depth.""" + from leann.registry import _has_app_indexes_limited + + # Create a deep directory structure + level4 = tmp_path / "l1" / "l2" / "l3" / "l4" + level4.mkdir(parents=True) + + # Only create a file beyond depth 3 + (level4 / "deep.leann.meta.json").touch() + + # Should not find it with max_depth=3 + assert not _has_app_indexes_limited(tmp_path, max_depth=3) + + # Create one at depth 2 + (tmp_path / "l1" / "l2" / "shallow.leann.meta.json").touch() + + # Now should find it + assert _has_app_indexes_limited(tmp_path, max_depth=3) + + def test_has_app_indexes_limited_skips_node_modules(self, tmp_path: Path): + """Should skip node_modules directory.""" + from leann.registry import _has_app_indexes_limited + + # Create a meta file inside node_modules + node_modules = tmp_path / "node_modules" + node_modules.mkdir() + (node_modules / "pkg.leann.meta.json").touch() + + # Should not find it + assert not _has_app_indexes_limited(tmp_path, max_depth=3) + + # Create a normal meta file + (tmp_path / "normal.leann.meta.json").touch() + + # Now should find it + assert _has_app_indexes_limited(tmp_path, max_depth=3) + + +class TestDiscoverIndexesPerformance: + """Test that _discover_indexes_in_project uses limited search.""" + + def test_discover_indexes_skips_deep_directories(self, tmp_path: Path): + """Should not scan directories beyond max_depth.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create a CLI format index (should always be found) + cli_indexes = tmp_path / ".leann" / "indexes" / "my-index" + cli_indexes.mkdir(parents=True) + (cli_indexes / "documents.leann.meta.json").touch() + + # Create an app format index at depth 4 (should not be found) + deep_dir = tmp_path / "a" / "b" / "c" / "d" + deep_dir.mkdir(parents=True) + (deep_dir / "deep.leann.meta.json").touch() + + indexes = cli._discover_indexes_in_project(tmp_path) + + # Should find the CLI index + assert any(idx["name"] == "my-index" for idx in indexes) + + # Should NOT find the deep app index + assert not any(idx["name"] == "d" for idx in indexes) + + def test_discover_indexes_respects_custom_max_depth(self, tmp_path: Path): + """Should find deeper indexes when max_depth is increased.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + + # Create an app format index at depth 5 + deep_dir = tmp_path / "a" / "b" / "c" / "d" / "e" + deep_dir.mkdir(parents=True) + (deep_dir / "deep.leann.meta.json").touch() + + # With default max_depth=3, should NOT find it + indexes_shallow = cli._discover_indexes_in_project(tmp_path, max_depth=3) + assert not any(idx["name"] == "e" for idx in indexes_shallow) + + # With max_depth=5, should find it + indexes_deep = cli._discover_indexes_in_project(tmp_path, max_depth=5) + assert any(idx["name"] == "e" for idx in indexes_deep) + + +class TestMaxDepthCliOption: + """Test the --max-depth CLI option for leann list.""" + + def test_max_depth_argument_is_parsed(self): + """The --max-depth argument should be properly parsed.""" + from leann.cli import LeannCLI + + cli = LeannCLI() + parser = cli.create_parser() + + # Test default value + args = parser.parse_args(["list"]) + assert args.max_depth == 3 + + # Test custom value + args = parser.parse_args(["list", "--max-depth", "5"]) + assert args.max_depth == 5 + + # Test another custom value + args = parser.parse_args(["list", "--max-depth", "10"]) + assert args.max_depth == 10 + + +class TestGlobalIndexRegistry: + """Test the global index registry for O(1) index discovery.""" + + def test_register_and_list_index(self, tmp_path: Path): + """Should register an index and list it from the registry.""" + from leann.registry import ( + GLOBAL_INDEX_REGISTRY_PATH, + _load_index_registry, + _save_index_registry, + register_index, + list_registered_indexes, + unregister_index, + ) + + # Use a temporary registry file + test_registry = tmp_path / "indexes.json" + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + # Register an index + index_path = tmp_path / ".leann" / "indexes" / "test-index" / "documents.leann" + index_path.parent.mkdir(parents=True) + index_path.touch() + (index_path.parent / "documents.leann.meta.json").touch() + + result = register_index( + name="test-index", + path=str(index_path), + index_type="cli", + ) + assert result is True + + # List indexes + indexes = list_registered_indexes(validate=True) + assert len(indexes) == 1 + assert indexes[0]["name"] == "test-index" + assert indexes[0]["index_type"] == "cli" + + # Unregister + result = unregister_index(str(index_path)) + assert result is True + + # Should be empty now + indexes = list_registered_indexes(validate=False) + assert len(indexes) == 0 + + def test_registry_validates_stale_entries(self, tmp_path: Path): + """Should remove entries for indexes that no longer exist.""" + from leann.registry import ( + _save_index_registry, + list_registered_indexes, + ) + + test_registry = tmp_path / "indexes.json" + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + # Create a registry with a stale entry + stale_entry = { + "name": "stale-index", + "path": str(tmp_path / "nonexistent" / "documents.leann"), + "index_type": "cli", + "created_at": "2024-01-01T00:00:00+00:00", + } + _save_index_registry([stale_entry]) + + # List with validation should remove the stale entry + indexes = list_registered_indexes(validate=True) + assert len(indexes) == 0 + + def test_register_index_updates_existing(self, tmp_path: Path): + """Should update an existing entry instead of duplicating.""" + from leann.registry import ( + register_index, + list_registered_indexes, + ) + + test_registry = tmp_path / "indexes.json" + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + # Create the index files + index_path = tmp_path / "test.leann" + index_path.touch() + (tmp_path / "test.leann.meta.json").touch() + + # Register twice with different names + register_index(name="first-name", path=str(index_path), index_type="app") + register_index(name="second-name", path=str(index_path), index_type="app") + + # Should only have one entry with the updated name + indexes = list_registered_indexes(validate=True) + assert len(indexes) == 1 + assert indexes[0]["name"] == "second-name" + + +class TestListIndexesWithRegistry: + """Test that list_indexes uses the global registry when available.""" + + def test_list_indexes_uses_registry_when_available(self, tmp_path: Path, capsys): + """Should use O(1) registry lookup when indexes are registered.""" + from leann.cli import LeannCLI + from leann.registry import register_index + + test_registry = tmp_path / "indexes.json" + + # Create an index + index_dir = tmp_path / ".leann" / "indexes" / "my-index" + index_dir.mkdir(parents=True) + index_path = index_dir / "documents.leann" + index_path.touch() + (index_dir / "documents.leann.meta.json").touch() + + with patch("leann.registry.GLOBAL_INDEX_REGISTRY_PATH", test_registry): + with patch("leann.cli.list_registered_indexes") as mock_list: + # Mock the registry to return our index + mock_list.return_value = [ + { + "name": "my-index", + "path": str(index_path), + "index_type": "cli", + "created_at": "2024-01-01T00:00:00+00:00", + } + ] + + cli = LeannCLI() + cli.list_indexes() + + captured = capsys.readouterr() + assert "O(1) lookup" in captured.out or "global registry" in captured.out.lower()