Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ Use the cocoindex-code MCP server for semantic code search when:
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
| `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing, given as a JSON array of strings (e.g. `'["**/migration.sql", "{**/*.md,**/*.txt}"]'`) | _(none)_ |


### Root Path Discovery
Expand Down
35 changes: 35 additions & 0 deletions src/cocoindex_code/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import json
import os
from dataclasses import dataclass
from pathlib import Path
Expand Down Expand Up @@ -42,6 +43,33 @@ def _discover_codebase_root() -> Path:
return root if root is not None else cwd


def _parse_json_string_list_env(var_name: str) -> list[str]:
    """Read ``var_name`` from the environment and decode it as a JSON list of strings.

    An unset, empty, or whitespace-only variable yields an empty list. Every
    entry is stripped of surrounding whitespace, and entries that are empty
    after stripping are dropped.

    Raises:
        ValueError: If the value is not valid JSON, is not a JSON array, or
            contains an entry that is not a string.
    """
    text = os.environ.get(var_name, "")
    if text.strip() == "":
        return []

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"{var_name} must be a JSON array of strings, got invalid JSON"
        ) from exc

    # Validate the overall shape before touching any entry: a non-list value
    # and a list holding a non-string entry are reported with the same message.
    is_string_list = isinstance(decoded, list) and all(
        isinstance(entry, str) for entry in decoded
    )
    if not is_string_list:
        raise ValueError(f"{var_name} must be a JSON array of strings")

    trimmed = (entry.strip() for entry in decoded)
    return [entry for entry in trimmed if entry]


@dataclass
class Config:
"""Configuration loaded from environment variables."""
Expand All @@ -52,6 +80,7 @@ class Config:
device: str | None
trust_remote_code: bool
extra_extensions: dict[str, str | None]
excluded_patterns: list[str]

@classmethod
def from_env(cls) -> Config:
Expand Down Expand Up @@ -99,13 +128,19 @@ def from_env(cls) -> Config:
else:
extra_extensions[f".{token}"] = None

# Excluded file glob patterns
excluded_patterns = _parse_json_string_list_env(
"COCOINDEX_CODE_EXCLUDED_PATTERNS"
)

return cls(
codebase_root_path=root,
embedding_model=embedding_model,
index_dir=index_dir,
device=device,
trust_remote_code=trust_remote_code,
extra_extensions=extra_extensions,
excluded_patterns=excluded_patterns,
)

@property
Expand Down
4 changes: 3 additions & 1 deletion src/cocoindex_code/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
ext: lang for ext, lang in config.extra_extensions.items() if lang is not None
}

EXCLUDED_PATTERNS = [
DEFAULT_EXCLUDED_PATTERNS = [
"**/.*", # Hidden directories
"**/__pycache__", # Python cache
"**/node_modules", # Node.js dependencies
Expand All @@ -63,6 +63,8 @@
"**/.cocoindex_code", # Our own index directory
]

# Effective exclusion list: the default patterns followed by any additional
# patterns carried on the loaded config.
EXCLUDED_PATTERNS = DEFAULT_EXCLUDED_PATTERNS + config.excluded_patterns

# Chunking configuration
CHUNK_SIZE = 2000
MIN_CHUNK_SIZE = 300
Expand Down
146 changes: 123 additions & 23 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,33 @@
from pathlib import Path
from unittest.mock import patch

from cocoindex_code.config import Config, _detect_device
import pytest

from cocoindex_code.config import Config

class TestDetectDevice:
"""Tests for device auto-detection."""

def test_returns_cuda_when_available(self) -> None:
with patch.dict(os.environ, {}, clear=False):
# Ensure env var is unset
os.environ.pop("COCOINDEX_CODE_DEVICE", None)
with patch("torch.cuda.is_available", return_value=True):
assert _detect_device() == "cuda"
class TestConfigDevice:
"""Tests for COCOINDEX_CODE_DEVICE env var handling."""

def test_returns_cpu_when_cuda_unavailable(self) -> None:
with patch.dict(os.environ, {}, clear=False):
def test_none_by_default(self, tmp_path: Path) -> None:
with patch.dict(
os.environ,
{"COCOINDEX_CODE_ROOT_PATH": str(tmp_path)},
):
os.environ.pop("COCOINDEX_CODE_DEVICE", None)
with patch("torch.cuda.is_available", return_value=False):
assert _detect_device() == "cpu"

def test_env_var_overrides_auto_detection(self) -> None:
with patch.dict(os.environ, {"COCOINDEX_CODE_DEVICE": "cpu"}):
with patch("torch.cuda.is_available", return_value=True):
assert _detect_device() == "cpu"
config = Config.from_env()
assert config.device is None

def test_returns_cpu_when_torch_missing(self) -> None:
with patch.dict(os.environ, {}, clear=False):
os.environ.pop("COCOINDEX_CODE_DEVICE", None)
with patch.dict("sys.modules", {"torch": None}):
assert _detect_device() == "cpu"
def test_env_var_overrides_device(self, tmp_path: Path) -> None:
with patch.dict(
os.environ,
{
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
"COCOINDEX_CODE_DEVICE": "cpu",
},
):
config = Config.from_env()
assert config.device == "cpu"


class TestConfigTrustRemoteCode:
Expand Down Expand Up @@ -158,3 +156,105 @@ def test_mixed_with_and_without_lang(self, tmp_path: Path) -> None:
):
config = Config.from_env()
assert config.extra_extensions == {".inc": "php", ".yaml": None, ".tpl": "html"}


class TestExcludedPatterns:
    """Tests for COCOINDEX_CODE_EXCLUDED_PATTERNS env var."""

    @staticmethod
    def _patterns_from_env(root, raw=None):
        """Load a Config with the patterns variable set to ``raw`` and return its patterns.

        ``raw=None`` removes the variable from the environment before the
        config is loaded. ``patch.dict`` restores the environment on exit,
        including when ``Config.from_env`` raises.
        """
        overrides = {"COCOINDEX_CODE_ROOT_PATH": str(root)}
        if raw is not None:
            overrides["COCOINDEX_CODE_EXCLUDED_PATTERNS"] = raw
        with patch.dict(os.environ, overrides):
            if raw is None:
                os.environ.pop("COCOINDEX_CODE_EXCLUDED_PATTERNS", None)
            return Config.from_env().excluded_patterns

    def test_empty_by_default(self, tmp_path: Path) -> None:
        # Variable absent from the environment entirely.
        assert self._patterns_from_env(tmp_path) == []

    def test_parses_json_array(self, tmp_path: Path) -> None:
        loaded = self._patterns_from_env(
            tmp_path, '["**/migration.sql", "**/*.d.ts"]'
        )
        assert loaded == ["**/migration.sql", "**/*.d.ts"]

    def test_preserves_commas_inside_globs(self, tmp_path: Path) -> None:
        # A brace group containing commas must stay a single pattern.
        loaded = self._patterns_from_env(tmp_path, '["{**/*.md,**/*.txt}"]')
        assert loaded == ["{**/*.md,**/*.txt}"]

    def test_trims_whitespace_and_ignores_empty_entries(self, tmp_path: Path) -> None:
        loaded = self._patterns_from_env(
            tmp_path, '[" **/migration.sql ", " ", "**/*.d.ts"]'
        )
        assert loaded == ["**/migration.sql", "**/*.d.ts"]

    def test_empty_string_gives_empty_list(self, tmp_path: Path) -> None:
        # Variable present but set to the empty string.
        assert self._patterns_from_env(tmp_path, "") == []

    def test_rejects_invalid_json(self, tmp_path: Path) -> None:
        expected_message = (
            "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings, "
            "got invalid JSON"
        )
        with pytest.raises(ValueError, match=expected_message):
            self._patterns_from_env(tmp_path, "**/migration.sql,**/*.d.ts")

    def test_rejects_valid_json_non_list(self, tmp_path: Path) -> None:
        expected_message = (
            "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings"
        )
        with pytest.raises(ValueError, match=expected_message):
            self._patterns_from_env(tmp_path, "{}")

    def test_rejects_non_string_entries(self, tmp_path: Path) -> None:
        expected_message = (
            "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings"
        )
        with pytest.raises(ValueError, match=expected_message):
            self._patterns_from_env(tmp_path, '["**/*.py", 1]')
Loading