crd · crd · Nov 7, 2025
diff --git a/.gitignore b/.gitignore
@@ -12,5 +12,8 @@ wheels/
 # RAG location
 .rag
 
+# Evaluation artefacts
+evaluations/results/
+
 # Config file
 config.yaml
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: venv install ingest chat lock clean
+.PHONY: venv install ingest chat evaluate test lock clean
 
 # Create/refresh a local .venv and install deps from pyproject/uv.lock
 install:
@@ -13,11 +13,14 @@ ingest:
 	uv run ingest.py
 
 chat:
 	uv run chat.py
+
+evaluate:
+	uv run evaluation/runner.py
 
 # Run the automated test suite
 test:
 	uv run --extra dev pytest
 
 # Create/update a lockfile explicitly (optional; uv sync also updates it)
 lock:

diff --git a/README.md b/README.md
@@ -24,7 +24,23 @@ cd logseq-chat
 make install
 ```
 
-Edit `config.yaml` and at a minimum set `logseq_root` to your Logseq graph directory.
+Copy `config.yaml.sample` to `config.yaml` and customise the values. The sample
+lists tuned defaults plus alternative values (chunk sizes, retrieval depth,
+synonym lists, etc.) that you can toggle as you experiment. At minimum set
+`logseq_root` to your Logseq graph directory.
+
+## Configuration cheat sheet
+- **Chunk size / overlap** – controls how much context each embedding sees.
+  Smaller chunks with slightly larger overlaps (`chunk_size: 650`,
+  `chunk_overlap: 160`) improve recall; larger chunks (`chunk_size: 1200`) speed
+  things up on slower machines.
+- **Retrieval depth** – adjust `retrieval.top_k` and `retrieval.mmr.enabled` to
+  trade recall for latency.
+- **Query expansion** – populate `retrieval.query_expansion.synonyms` with
+  domain-specific vocabulary. Asking “What did I write about sailing?” will also
+  search for “sloop” and “schooner” with the default config.
+- **Model temperature** – lower values keep answers grounded; increase towards
+  `0.3` for more conversational replies.
 
 ## Build index
 ```bash
@@ -41,11 +57,34 @@ make chat
 make test
 ```
 
+## Evaluate presets
+```bash
+make evaluate
+```
+
+The evaluation harness ingests your graph for each preset listed in
+`evaluations/configurations.yaml`, runs the labelled queries from
+`evaluations/datasets/baseline.yaml`, and prints a leaderboard ranked by the
+weighted score from `evaluation.scoring` in `config.yaml`. The bundled presets are:
+
+| Name        | Purpose                                           |
+| ----------- | ------------------------------------------------- |
+| balanced    | Default profile – accuracy, coverage, and speed.  |
+| high_recall | Smaller chunks, deeper retrieval, more overlap.   |
+| fast_local  | Larger chunks, shallow retrieval for quick tests. |
+
+After the run, the best-scoring configuration is reported and summarised in
+`evaluations/results/latest.yaml`. Use that preset as a starting point for new
+experiments or promote it to your day-to-day `config.yaml`.
+
 ### Example questions
 - Summarize tasks tagged #home in October 2025.
 - Find notes referencing [[Team Topologies]] and list my pros/cons.
 
 ## Notes
 - Skips `assets/` by default. Enable OCR later if needed.
 - Uses Markdown-aware chunking; tags from `#tag` and `tags::` stored in metadata.
-- For faster machines, try bigger models; for CPU-only, consider `llama3.2` or `qwen2.5:7b` and smaller chunks.
+- The default configuration enables targeted synonym expansion to improve recall
+  for concept-driven queries (e.g. “sailing” → “sloop”, “schooner”).
+- For faster machines, try bigger models; for CPU-only, consider `llama3.2` or
+  `qwen2.5:7b` and larger chunk sizes to reduce request volume.
diff --git a/app_config.py b/app_config.py
@@ -0,0 +1,169 @@
+"""Configuration loader and helpers for logseq-chat.
+
+This module centralises default settings and exposes a convenient helper for
+loading ``config.yaml`` (or preset overrides) as a nested namespace.  Keeping
+all configuration semantics in one place makes it easier to experiment with the
+RAG pipeline while ensuring scripts like ``ingest.py`` and ``chat.py`` stay in
+sync.
+"""
+
+from __future__ import annotations
+
+import copy
+from pathlib import Path
+from typing import Any, Dict, Iterable, MutableMapping, Optional
+
+import yaml
+
+
+# Baseline settings shared by every script.  ``load_app_config`` deep-copies
+# this mapping and merges ``config.yaml`` plus any overrides on top of it, so a
+# key missing from the user's file falls back to the value below.
+DEFAULT_CONFIG: Dict[str, Any] = {
+    # Logseq graph directory; left empty here, the README asks users to set it.
+    "logseq_root": "",
+    "include_dirs": ["journals", "pages"],
+    # ``assets/`` is skipped by default (see the README notes).
+    "exclude_globs": ["**/.git/**", "**/.DS_Store", "**/assets/**"],
+    "file_exts": [".md"],
+    "runtime": {
+        # Forwarded by chat.py as the Ollama client's ``request_timeout``
+        # (presumably seconds -- TODO confirm).
+        "request_timeout": 180,
+    },
+    "chunk": {
+        # See the README cheat sheet for the recall/speed trade-off.  Units are
+        # not visible in this module -- TODO confirm (characters vs. tokens).
+        "chunk_size": 900,
+        "chunk_overlap": 120,
+    },
+    "retrieval": {
+        # chat.py passes ``top_k`` as ``similarity_top_k`` and ``mmr.enabled``
+        # as ``use_mmr`` when building the query engine.
+        "top_k": 6,
+        "mmr": {"enabled": True},
+        "query_expansion": {
+            "enabled": True,
+            "max_expansions": 6,
+            # Maps a query term to extra search terms; presumably read by
+            # ``query_expansion.expand_query`` (not shown here) -- verify.
+            "synonyms": {
+                "sailing": ["sloop", "schooner", "boat"],
+                "boat": ["vessel", "ship"],
+            },
+        },
+    },
+    "models": {
+        # chat.py hands ``name`` and ``temperature`` to the Ollama LLM client.
+        "llm": {
+            "name": "llama3.1",
+            "temperature": 0.1,
+        },
+        # chat.py hands ``name`` to ``OllamaEmbedding`` as ``model_name``.
+        "embedding": {
+            "name": "nomic-embed-text",
+        },
+    },
+    "storage": {
+        # Path of the persistent Chroma client and the collection chat.py opens.
+        "chroma_path": ".rag/chroma",
+        "collection_name": "logseq_rag",
+        # NOTE(review): not read by any code visible here; presumably consumed
+        # by ingest.py -- confirm.
+        "clear_before_ingest": False,
+    },
+    "evaluation": {
+        "dataset": "evaluations/datasets/baseline.yaml",
+        "configurations_file": "evaluations/configurations.yaml",
+        # NOTE(review): ``None`` presumably means "run every query" -- confirm
+        # against the evaluation runner.
+        "max_queries": None,
+        # The five weights below sum to 1.0.
+        "scoring": {
+            "accuracy_weight": 0.35,
+            "coverage_weight": 0.2,
+            "relevance_weight": 0.2,
+            "hallucination_weight": 0.15,
+            "speed_weight": 0.1,
+        },
+    },
+}
+
+
+def _deep_merge(base: MutableMapping[str, Any], updates: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
+    """Fold ``updates`` into ``base`` in place and hand ``base`` back.
+
+    When both sides hold a ``dict`` under the same key the two are merged
+    recursively.  Every other value -- lists included -- overwrites whatever
+    ``base`` had.  Pass a copy of ``base`` if the original must stay untouched.
+    """
+
+    for key in updates:
+        incoming = updates[key]
+        current = base.get(key)
+        if not (isinstance(incoming, dict) and isinstance(current, dict)):
+            # Scalars, lists and type mismatches replace the old value outright.
+            base[key] = incoming
+            continue
+        _deep_merge(current, incoming)
+    return base
+
+
+class ConfigNamespace(dict):
+    """Dict subclass that exposes keys as attributes (``cfg.models.llm.name``).
+
+    Nested mappings are converted to ``ConfigNamespace`` the first time they
+    are read through attribute access, and the converted object is stored back
+    in place.  Repeated reads therefore return the same object, so nested
+    assignments such as ``cfg.retrieval.top_k = 10`` persist instead of being
+    applied to a throwaway copy.
+
+    Keys that collide with ``dict`` method names (``items``, ``keys``, ...) are
+    only reachable through item access, because ``__getattr__`` runs only after
+    normal attribute lookup has failed.
+    """
+
+    def __getattr__(self, item: str) -> Any:
+        """Return ``self[item]``, wrapping nested containers on first access.
+
+        Raises:
+            AttributeError: ``item`` is not a key of the namespace.
+        """
+        try:
+            value = self[item]
+        except KeyError as exc:  # pragma: no cover - guard for clarity
+            raise AttributeError(item) from exc
+        wrapped = _wrap(value)
+        if wrapped is not value:
+            # Cache the wrapper so later reads see the same object; without
+            # this every access would build a new copy and drop nested writes.
+            dict.__setitem__(self, item, wrapped)
+        return wrapped
+
+    # Attribute writes and deletes map straight onto the underlying dict.
+    __setattr__ = dict.__setitem__  # type: ignore
+    __delattr__ = dict.__delitem__  # type: ignore
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a deep Python ``dict`` copy of the namespace."""
+
+        return _unwrap(self)
+
+
+def _wrap(value: Any) -> Any:
+    """Convert plain mappings (also inside lists) to ``ConfigNamespace``.
+
+    Values that are already a ``ConfigNamespace`` are returned unchanged so
+    that wrapping is idempotent and object identity is preserved.  Lists are
+    rebuilt with each element wrapped; any other value passes through as-is.
+    """
+    if isinstance(value, ConfigNamespace):
+        return value
+    if isinstance(value, dict):
+        return ConfigNamespace(value)
+    if isinstance(value, list):
+        return [_wrap(item) for item in value]
+    return value
+
+
+def _unwrap(value: Any) -> Any:
+    """Return a deep copy of ``value`` built from plain ``dict``/``list`` containers."""
+    # ``ConfigNamespace`` subclasses ``dict``, so one mapping branch covers both.
+    if isinstance(value, dict):
+        return {key: _unwrap(inner) for key, inner in value.items()}
+    if isinstance(value, list):
+        return list(map(_unwrap, value))
+    # Leaves are deep-copied so the result shares no state with the input.
+    return copy.deepcopy(value)
+
+
+def _load_yaml(path: Path) -> Dict[str, Any]:
+    """Parse the YAML file at ``path`` and return its top-level mapping.
+
+    An empty (or otherwise falsy) document yields ``{}``.
+
+    Raises:
+        FileNotFoundError: ``path`` does not exist.
+        ValueError: the document's top level is not a mapping.
+    """
+    if path.exists():
+        with path.open("r", encoding="utf-8") as handle:
+            parsed = yaml.safe_load(handle)
+        if not parsed:
+            parsed = {}
+        if isinstance(parsed, dict):
+            return parsed
+        raise ValueError(f"Expected mapping at top of {path}, got {type(parsed)!r}")
+    raise FileNotFoundError(
+        f"Configuration file not found: {path}\n"
+        "Create one by copying config.yaml.sample and adjusting the values."
+    )
+
+
+def load_app_config(
+    path: Optional[Path] = None,
+    *,
+    overrides: Optional[Iterable[MutableMapping[str, Any]]] = None,
+) -> ConfigNamespace:
+    """Build the effective configuration as a ``ConfigNamespace``.
+
+    Layers are applied in order: a deep copy of ``DEFAULT_CONFIG``, then the
+    YAML file at ``path`` (``config.yaml`` when omitted), then each mapping in
+    ``overrides``.  Overrides are deep-copied first, so the caller's objects
+    are never mutated.
+    """
+
+    merged = copy.deepcopy(DEFAULT_CONFIG)
+    _deep_merge(merged, _load_yaml(path or Path("config.yaml")))
+
+    for layer in overrides or ():
+        _deep_merge(merged, copy.deepcopy(layer))
+
+    return ConfigNamespace(merged)
+
+
+def apply_overrides(base: ConfigNamespace, *overrides: MutableMapping[str, Any]) -> ConfigNamespace:
+    """Layer ``overrides`` over a copy of ``base`` and return the result.
+
+    ``base`` and the override mappings themselves are left unmodified: the
+    merge works on ``base.to_dict()`` and on deep copies of each override.
+    """
+
+    layers = [copy.deepcopy(layer) for layer in overrides]
+    result = base.to_dict()
+    for layer in layers:
+        _deep_merge(result, layer)
+    return ConfigNamespace(result)
+
+
+__all__ = ["ConfigNamespace", "DEFAULT_CONFIG", "apply_overrides", "load_app_config"]
diff --git a/chat.py b/chat.py
@@ -7,16 +7,15 @@
 """
 
 import chromadb
-import yaml
 from llama_index.core import Settings, VectorStoreIndex
 from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.llms.ollama import Ollama
 from llama_index.vector_stores.chroma import ChromaVectorStore
 
-with open("config.yaml", "r", encoding="utf-8") as f:
-    CONFIG = yaml.safe_load(f)
+from app_config import ConfigNamespace, load_app_config
+from query_expansion import expand_query
 
-def build_query_engine():
+def build_query_engine(config: ConfigNamespace):
     """Create a ``QueryEngine`` that can answer questions over the Logseq index.
 
     The steps here mirror the high-level components of a RAG system: choose an
@@ -32,25 +31,24 @@ def build_query_engine():
 
     # Models (local via Ollama)
     Settings.llm = Ollama(
-        model=CONFIG["models"]["llm"],
-        request_timeout=180,
-    )
-    Settings.embed_model = OllamaEmbedding(
-        model_name=CONFIG["models"]["embedding"],
+        model=config.models.llm.name,
+        request_timeout=config.runtime.request_timeout,
+        temperature=config.models.llm.temperature,
     )
+    Settings.embed_model = OllamaEmbedding(model_name=config.models.embedding.name)
 
     # Vector store
-    client = chromadb.PersistentClient(path=CONFIG["storage"]["chroma_path"])
-    collection = client.get_or_create_collection("logseq_rag")
+    client = chromadb.PersistentClient(path=config.storage.chroma_path)
+    collection = client.get_or_create_collection(config.storage.collection_name)
     vector_store = ChromaVectorStore(chroma_collection=collection)
 
     # Index from existing Chroma collection
     index = VectorStoreIndex.from_vector_store(vector_store)
 
     # Let LlamaIndex create the retriever internally; pass our knobs only
     query_engine = index.as_query_engine(
-        similarity_top_k=CONFIG["retrieval"]["top_k"],
-        use_mmr=CONFIG["retrieval"]["mmr"],
+        similarity_top_k=config.retrieval.top_k,
+        use_mmr=getattr(config.retrieval.mmr, "enabled", False),
     )
     return query_engine
 
@@ -61,8 +59,10 @@ def main():
     pages. Use ``:q`` to exit when you are done experimenting.
     """
 
+    config = load_app_config()
+
     print("Loading query engine...")
-    qe = build_query_engine()
+    qe = build_query_engine(config)
     print("Ready. Type your question (or :q to quit).")
     while True:
         try:
@@ -73,7 +73,11 @@ def main():
         if q == ":q":
             break
 
-        resp = qe.query(q)
+        expanded = expand_query(q, config)
+        if expanded.changed:
+            print(f"\n(expanded query with synonyms: {', '.join(expanded.added_terms)})")
+
+        resp = qe.query(expanded.expanded)
 
         print("\n--- Answer ---")
         print(resp.response)