def _parse_json_string_list_env(var_name: str) -> list[str]:
    """Parse an environment variable as a JSON array of strings.

    An unset, empty, or whitespace-only variable yields an empty list.
    Every entry is stripped of surrounding whitespace, and entries that are
    empty after stripping are dropped. Order is preserved.

    Args:
        var_name: Name of the environment variable to read.

    Returns:
        The stripped, non-empty entries in their original order.

    Raises:
        ValueError: If the value is not valid JSON, is valid JSON but not an
            array, or contains an entry that is not a string. Every message
            starts with ``"<var_name> must be a JSON array of strings"`` and
            then names what was actually found.
    """
    raw_value = os.environ.get(var_name, "")
    if not raw_value.strip():
        return []

    try:
        parsed = json.loads(raw_value)
    except json.JSONDecodeError as exc:
        # Keep the historical message as a prefix and append the decoder's
        # own description so the user can locate the syntax error.
        raise ValueError(
            f"{var_name} must be a JSON array of strings, got invalid JSON: {exc}"
        ) from exc

    if not isinstance(parsed, list):
        raise ValueError(
            f"{var_name} must be a JSON array of strings, "
            f"got {type(parsed).__name__}"
        )

    patterns: list[str] = []
    for index, entry in enumerate(parsed):
        if not isinstance(entry, str):
            # Report which entry is wrong; long pattern lists are otherwise
            # tedious to debug.
            raise ValueError(
                f"{var_name} must be a JSON array of strings, "
                f"got {type(entry).__name__} at index {index}"
            )
        stripped = entry.strip()
        if stripped:
            patterns.append(stripped)

    return patterns
def _config_with_env(tmp_path: Path, unset: str = "", **env_vars: str) -> Config:
    """Return ``Config.from_env()`` built under a temporarily patched environment.

    ``COCOINDEX_CODE_ROOT_PATH`` is always set to *tmp_path* and *env_vars*
    are layered on top of it. When *unset* is given, that variable is removed
    before the config is loaded. ``patch.dict`` restores ``os.environ`` on
    exit.
    """
    overrides = {"COCOINDEX_CODE_ROOT_PATH": str(tmp_path), **env_vars}
    with patch.dict(os.environ, overrides):
        if unset:
            os.environ.pop(unset, None)
        return Config.from_env()


class TestConfigDevice:
    """COCOINDEX_CODE_DEVICE is passed through to ``Config.device``."""

    def test_none_by_default(self, tmp_path: Path) -> None:
        loaded = _config_with_env(tmp_path, unset="COCOINDEX_CODE_DEVICE")
        assert loaded.device is None

    def test_env_var_overrides_device(self, tmp_path: Path) -> None:
        loaded = _config_with_env(tmp_path, COCOINDEX_CODE_DEVICE="cpu")
        assert loaded.device == "cpu"


class TestExcludedPatterns:
    """COCOINDEX_CODE_EXCLUDED_PATTERNS is parsed as a JSON array of globs."""

    def test_empty_by_default(self, tmp_path: Path) -> None:
        loaded = _config_with_env(tmp_path, unset="COCOINDEX_CODE_EXCLUDED_PATTERNS")
        assert loaded.excluded_patterns == []

    def test_parses_json_array(self, tmp_path: Path) -> None:
        loaded = _config_with_env(
            tmp_path,
            COCOINDEX_CODE_EXCLUDED_PATTERNS='["**/migration.sql", "**/*.d.ts"]',
        )
        assert loaded.excluded_patterns == ["**/migration.sql", "**/*.d.ts"]

    def test_preserves_commas_inside_globs(self, tmp_path: Path) -> None:
        loaded = _config_with_env(
            tmp_path,
            COCOINDEX_CODE_EXCLUDED_PATTERNS='["{**/*.md,**/*.txt}"]',
        )
        assert loaded.excluded_patterns == ["{**/*.md,**/*.txt}"]

    def test_trims_whitespace_and_ignores_empty_entries(self, tmp_path: Path) -> None:
        loaded = _config_with_env(
            tmp_path,
            COCOINDEX_CODE_EXCLUDED_PATTERNS='[" **/migration.sql ", " ", "**/*.d.ts"]',
        )
        assert loaded.excluded_patterns == ["**/migration.sql", "**/*.d.ts"]

    def test_empty_string_gives_empty_list(self, tmp_path: Path) -> None:
        loaded = _config_with_env(tmp_path, COCOINDEX_CODE_EXCLUDED_PATTERNS="")
        assert loaded.excluded_patterns == []

    def test_rejects_invalid_json(self, tmp_path: Path) -> None:
        expected = (
            "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings, "
            "got invalid JSON"
        )
        with pytest.raises(ValueError, match=expected):
            _config_with_env(
                tmp_path,
                COCOINDEX_CODE_EXCLUDED_PATTERNS="**/migration.sql,**/*.d.ts",
            )

    def test_rejects_valid_json_non_list(self, tmp_path: Path) -> None:
        expected = "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings"
        with pytest.raises(ValueError, match=expected):
            _config_with_env(tmp_path, COCOINDEX_CODE_EXCLUDED_PATTERNS="{}")

    def test_rejects_non_string_entries(self, tmp_path: Path) -> None:
        expected = "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings"
        with pytest.raises(ValueError, match=expected):
            _config_with_env(
                tmp_path,
                COCOINDEX_CODE_EXCLUDED_PATTERNS='["**/*.py", 1]',
            )