From a7f5c3c7c417efe40fd16271462de8f430eae69b Mon Sep 17 00:00:00 2001 From: Ruben Fricke Date: Thu, 12 Mar 2026 16:37:26 +0100 Subject: [PATCH 1/2] feat: support custom exclude patterns --- README.md | 1 + src/cocoindex_code/config.py | 11 +++++ src/cocoindex_code/indexer.py | 4 +- tests/test_config.py | 90 +++++++++++++++++++++++++---------- 4 files changed, 81 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 69fa865..4355395 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,7 @@ Use the cocoindex-code MCP server for semantic code search when: | `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` | | `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` | | `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ | +| `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing (comma-separated, e.g. `"**/migration.sql,**/*.d.ts"`) | _(none)_ | ### Root Path Discovery diff --git a/src/cocoindex_code/config.py b/src/cocoindex_code/config.py index 07dce95..5fc7f4e 100644 --- a/src/cocoindex_code/config.py +++ b/src/cocoindex_code/config.py @@ -52,6 +52,7 @@ class Config: device: str | None trust_remote_code: bool extra_extensions: dict[str, str | None] + excluded_patterns: list[str] @classmethod def from_env(cls) -> Config: @@ -99,6 +100,15 @@ def from_env(cls) -> Config: else: extra_extensions[f".{token}"] = None + # Excluded file glob patterns + raw_excluded_patterns = os.environ.get("COCOINDEX_CODE_EXCLUDED_PATTERNS", "") + excluded_patterns: list[str] = [] + for pattern in raw_excluded_patterns.split(","): + pattern = pattern.strip() + if not pattern: + continue + excluded_patterns.append(pattern) + return cls( codebase_root_path=root, embedding_model=embedding_model, @@ -106,6 +116,7 @@ def from_env(cls) -> Config: device=device, trust_remote_code=trust_remote_code, extra_extensions=extra_extensions, + excluded_patterns=excluded_patterns, ) @property diff --git a/src/cocoindex_code/indexer.py b/src/cocoindex_code/indexer.py index 37b1f7f..da21191 100644 --- a/src/cocoindex_code/indexer.py +++ b/src/cocoindex_code/indexer.py @@ -51,7 +51,7 @@ ext: lang for ext, lang in config.extra_extensions.items() if lang is not None } -EXCLUDED_PATTERNS = [ +DEFAULT_EXCLUDED_PATTERNS = [ "**/.*", # Hidden directories "**/__pycache__", # Python cache "**/node_modules", # Node.js dependencies @@ -63,6 +63,8 @@ "**/.cocoindex_code", # Our own index directory ] +EXCLUDED_PATTERNS = DEFAULT_EXCLUDED_PATTERNS + config.excluded_patterns + # Chunking configuration CHUNK_SIZE = 2000 MIN_CHUNK_SIZE = 300 diff --git a/tests/test_config.py b/tests/test_config.py index 5db91bc..5dd20a9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -6,35 +6,31 @@ from pathlib import Path from unittest.mock import patch -from cocoindex_code.config import Config, _detect_device +from cocoindex_code.config import Config -class TestDetectDevice: - """Tests for device auto-detection.""" +class TestConfigDevice: + """Tests for COCOINDEX_CODE_DEVICE env var handling.""" - def test_returns_cuda_when_available(self) -> None: - with patch.dict(os.environ, {}, clear=False): - # Ensure env var is unset - os.environ.pop("COCOINDEX_CODE_DEVICE", None) - with patch("torch.cuda.is_available", return_value=True): - assert _detect_device() == "cuda" - - def test_returns_cpu_when_cuda_unavailable(self) -> None: - with patch.dict(os.environ, {}, clear=False): + def test_none_by_default(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + {"COCOINDEX_CODE_ROOT_PATH": str(tmp_path)}, + ): os.environ.pop("COCOINDEX_CODE_DEVICE", None) - with patch("torch.cuda.is_available", return_value=False): - assert _detect_device() == "cpu" - - def test_env_var_overrides_auto_detection(self) -> None: - with patch.dict(os.environ, {"COCOINDEX_CODE_DEVICE": "cpu"}): - with patch("torch.cuda.is_available", return_value=True): - assert _detect_device() == "cpu" + config = Config.from_env() + assert config.device is None - def test_returns_cpu_when_torch_missing(self) -> None: - with patch.dict(os.environ, {}, clear=False): - os.environ.pop("COCOINDEX_CODE_DEVICE", None) - with patch.dict("sys.modules", {"torch": None}): - assert _detect_device() == "cpu" + def test_env_var_overrides_device(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_DEVICE": "cpu", + }, + ): + config = Config.from_env() + assert config.device == "cpu" class TestConfigTrustRemoteCode: @@ -158,3 +154,49 @@ def test_mixed_with_and_without_lang(self, tmp_path: Path) -> None: ): config = Config.from_env() assert config.extra_extensions == {".inc": "php", ".yaml": None, ".tpl": "html"} + + +class TestExcludedPatterns: + """Tests for COCOINDEX_CODE_EXCLUDED_PATTERNS env var.""" + + def test_empty_by_default(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + {"COCOINDEX_CODE_ROOT_PATH": str(tmp_path)}, + ): + os.environ.pop("COCOINDEX_CODE_EXCLUDED_PATTERNS", None) + config = Config.from_env() + assert config.excluded_patterns == [] + + def test_parses_comma_separated(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_EXCLUDED_PATTERNS": "**/migration.sql,**/*.d.ts", + }, + ): + config = Config.from_env() + assert config.excluded_patterns == ["**/migration.sql", "**/*.d.ts"] + + def test_trims_whitespace(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_EXCLUDED_PATTERNS": " **/migration.sql , **/*.d.ts , ", + }, + ): + config = Config.from_env() + assert config.excluded_patterns == ["**/migration.sql", "**/*.d.ts"] + + def test_empty_string_gives_empty_list(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_EXCLUDED_PATTERNS": "", + }, + ): + config = Config.from_env() + assert config.excluded_patterns == [] From bdac42e55b59b88f41e1b57384350d43bf500b3d Mon Sep 17 00:00:00 2001 From: Ruben Fricke Date: Thu, 12 Mar 2026 17:05:48 +0100 Subject: [PATCH 2/2] fix: support commas in custom exclude patterns --- README.md | 2 +- src/cocoindex_code/config.py | 38 +++++++++++++++++---- tests/test_config.py | 66 +++++++++++++++++++++++++++++++++--- 3 files changed, 94 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 4355395..e21b037 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ Use the cocoindex-code MCP server for semantic code search when: | `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` | | `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` | | `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ | -| `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing (comma-separated, e.g. `"**/migration.sql,**/*.d.ts"`) | _(none)_ | +| `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing as a JSON array (e.g. `'["**/migration.sql", "{**/*.md,**/*.txt}"]'`) | _(none)_ | ### Root Path Discovery diff --git a/src/cocoindex_code/config.py b/src/cocoindex_code/config.py index 5fc7f4e..9040bce 100644 --- a/src/cocoindex_code/config.py +++ b/src/cocoindex_code/config.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json import os from dataclasses import dataclass from pathlib import Path @@ -42,6 +43,33 @@ def _discover_codebase_root() -> Path: return root if root is not None else cwd +def _parse_json_string_list_env(var_name: str) -> list[str]: + """Parse an environment variable as a JSON array of strings.""" + raw_value = os.environ.get(var_name, "") + if not raw_value.strip(): + return [] + + try: + parsed = json.loads(raw_value) + except json.JSONDecodeError as exc: + raise ValueError( + f"{var_name} must be a JSON array of strings, got invalid JSON" + ) from exc + + if not isinstance(parsed, list): + raise ValueError(f"{var_name} must be a JSON array of strings") + + result: list[str] = [] + for item in parsed: + if not isinstance(item, str): + raise ValueError(f"{var_name} must be a JSON array of strings") + item = item.strip() + if item: + result.append(item) + + return result + + @dataclass class Config: """Configuration loaded from environment variables.""" @@ -101,13 +129,9 @@ def from_env(cls) -> Config: extra_extensions[f".{token}"] = None # Excluded file glob patterns - raw_excluded_patterns = os.environ.get("COCOINDEX_CODE_EXCLUDED_PATTERNS", "") - excluded_patterns: list[str] = [] - for pattern in raw_excluded_patterns.split(","): - pattern = pattern.strip() - if not pattern: - continue - excluded_patterns.append(pattern) + excluded_patterns = _parse_json_string_list_env( + "COCOINDEX_CODE_EXCLUDED_PATTERNS" + ) return cls( codebase_root_path=root, diff --git a/tests/test_config.py b/tests/test_config.py index 5dd20a9..d0abb38 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -6,6 +6,8 @@ from pathlib import Path from unittest.mock import patch +import pytest + from cocoindex_code.config import Config @@ -168,23 +170,34 @@ def test_empty_by_default(self, tmp_path: Path) -> None: config = Config.from_env() assert config.excluded_patterns == [] - def test_parses_comma_separated(self, tmp_path: Path) -> None: + def test_parses_json_array(self, tmp_path: Path) -> None: with patch.dict( os.environ, { "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), - "COCOINDEX_CODE_EXCLUDED_PATTERNS": "**/migration.sql,**/*.d.ts", + "COCOINDEX_CODE_EXCLUDED_PATTERNS": '["**/migration.sql", "**/*.d.ts"]', }, ): config = Config.from_env() assert config.excluded_patterns == ["**/migration.sql", "**/*.d.ts"] - def test_trims_whitespace(self, tmp_path: Path) -> None: + def test_preserves_commas_inside_globs(self, tmp_path: Path) -> None: with patch.dict( os.environ, { "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), - "COCOINDEX_CODE_EXCLUDED_PATTERNS": " **/migration.sql , **/*.d.ts , ", + "COCOINDEX_CODE_EXCLUDED_PATTERNS": '["{**/*.md,**/*.txt}"]', + }, + ): + config = Config.from_env() + assert config.excluded_patterns == ["{**/*.md,**/*.txt}"] + + def test_trims_whitespace_and_ignores_empty_entries(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_EXCLUDED_PATTERNS": '[" **/migration.sql ", " ", "**/*.d.ts"]', }, ): config = Config.from_env() @@ -200,3 +213,48 @@ def test_empty_string_gives_empty_list(self, tmp_path: Path) -> None: ): config = Config.from_env() assert config.excluded_patterns == [] + + def test_rejects_invalid_json(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_EXCLUDED_PATTERNS": "**/migration.sql,**/*.d.ts", + }, + ): + with pytest.raises( + ValueError, + match=( + "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings, " + "got invalid JSON" + ), + ): + Config.from_env() + + def test_rejects_valid_json_non_list(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_EXCLUDED_PATTERNS": "{}", + }, + ): + with pytest.raises( + ValueError, + match="COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings", + ): + Config.from_env() + + def test_rejects_non_string_entries(self, tmp_path: Path) -> None: + with patch.dict( + os.environ, + { + "COCOINDEX_CODE_ROOT_PATH": str(tmp_path), + "COCOINDEX_CODE_EXCLUDED_PATTERNS": '["**/*.py", 1]', + }, + ): + with pytest.raises( + ValueError, + match="COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings", + ): + Config.from_env()