From cae50f1a90157af7974fb4bfb6c86d9c5542dc24 Mon Sep 17 00:00:00 2001
From: Cory Donnelly <152584+crd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 08:23:49 -0500
Subject: [PATCH] Add configuration presets and evaluation harness

---
 .gitignore                         |   3 +
 Makefile                           |   9 +-
 README.md                          |  43 ++++-
 app_config.py                      | 169 ++++++++++++++++++++
 chat.py                            |  34 ++--
 config.yaml.sample                 |  62 +++++---
 config/presets/default.yaml        |  47 ++++++
 config/presets/high_recall.yaml    |  16 ++
 config/presets/lightweight.yaml    |  15 ++
 evaluation/runner.py               | 247 +++++++++++++++++++++++++++++
 evaluations/configurations.yaml    |  14 ++
 evaluations/datasets/baseline.yaml |  47 ++++++
 ingest.py                          |  70 +++++---
 query_expansion.py                 |  77 +++++++++
 tests/test_ingest.py               | 143 ++++++++++++++++-
 15 files changed, 930 insertions(+), 66 deletions(-)
 create mode 100644 app_config.py
 create mode 100644 config/presets/default.yaml
 create mode 100644 config/presets/high_recall.yaml
 create mode 100644 config/presets/lightweight.yaml
 create mode 100644 evaluation/runner.py
 create mode 100644 evaluations/configurations.yaml
 create mode 100644 evaluations/datasets/baseline.yaml
 create mode 100644 query_expansion.py

diff --git a/.gitignore b/.gitignore
index 4e3547a..cb73e9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,5 +12,8 @@ wheels/
 # RAG location
 .rag
 
+# Evaluation artefacts
+evaluations/results/
+
 # Config file
 config.yaml
diff --git a/Makefile b/Makefile
index 0a65ca5..c270313 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: venv install ingest chat lock clean
+.PHONY: venv install ingest chat evaluate lock clean
 
 # Create/refresh a local .venv and install deps from pyproject/uv.lock
 install:
@@ -13,11 +13,14 @@ ingest:
 	uv run ingest.py
 
 chat:
-	uv run chat.py
+	uv run chat.py
+
+evaluate:
+	uv run evaluation/runner.py
 
 # Run the automated test suite
 test:
-	uv run --extra dev pytest
+	uv run --extra dev pytest
 
 # Create/update a lockfile explicitly (optional; uv sync also updates it)
 lock:
diff --git a/README.md b/README.md
index d54ba7e..3d61259 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,23 @@ cd logseq-chat
 make install
 ```
 
-Edit `config.yaml` and at a minimum set `logseq_root` to your Logseq graph directory.
+Copy `config.yaml.sample` to `config.yaml` and customise the values. The sample
+lists tuned defaults plus alternative values (chunk sizes, retrieval depth,
+synonym lists, etc.) that you can toggle as you experiment. At minimum set
+`logseq_root` to your Logseq graph directory.
+
+## Configuration cheat sheet
+- **Chunk size / overlap** – controls how much context each embedding sees.
+  Smaller chunks with slightly larger overlaps (`chunk_size: 650`,
+  `chunk_overlap: 160`) improve recall; larger chunks (`chunk_size: 1200`) speed
+  things up on slower machines.
+- **Retrieval depth** – adjust `retrieval.top_k` and `retrieval.mmr.enabled` to
+  trade recall for latency.
+- **Query expansion** – populate `retrieval.query_expansion.synonyms` with
+  domain-specific vocabulary. Asking “What did I write about sailing?” will also
+  search for “sloop” and “schooner” with the default config.
+- **Model temperature** – lower values keep answers grounded; increase towards
+  `0.3` for more conversational replies.
 
 ## Build index
 ```bash
@@ -41,6 +57,26 @@ make chat
 make test
 ```
 
+## Evaluate presets
+```bash
+make evaluate
+```
+
+The evaluation harness ingests your graph for each preset listed in
+`evaluations/configurations.yaml`, runs the labelled queries from
+`evaluations/datasets/baseline.yaml`, and prints a leaderboard ranked by the
+weighted scoring formula defined in `config.yaml`. The bundled presets are:
+
+| Name        | Purpose                                           |
+| ----------- | ------------------------------------------------- |
+| balanced    | Default profile – accuracy, coverage, and speed.  |
+| high_recall | Smaller chunks, deeper retrieval, more overlap.   |
+| fast_local  | Larger chunks, shallow retrieval for quick tests. |
+
+After the run, the best-scoring configuration is reported and summarised in
+`evaluations/results/latest.yaml`. Use that preset as a starting point for new
+experiments or promote it to your day-to-day `config.yaml`.
+
 ### Example questions
 - Summarize tasks tagged #home in October 2025.
 - Find notes referencing [[Team Topologies]] and list my pros/cons.
@@ -48,4 +84,7 @@ make test
 ## Notes
 - Skips `assets/` by default. Enable OCR later if needed.
 - Uses Markdown-aware chunking; tags from `#tag` and `tags::` stored in metadata.
-- For faster machines, try bigger models; for CPU-only, consider `llama3.2` or `qwen2.5:7b` and smaller chunks.
+- The default configuration enables targeted synonym expansion to improve recall
+  for concept-driven queries (e.g. “sailing” → “sloop”, “schooner”).
+- For faster machines, try bigger models; for CPU-only, consider `llama3.2` or
+  `qwen2.5:7b` and larger chunk sizes to reduce request volume.
diff --git a/app_config.py b/app_config.py
new file mode 100644
index 0000000..0b4003a
--- /dev/null
+++ b/app_config.py
@@ -0,0 +1,169 @@
+"""Configuration loader and helpers for logseq-chat.
+
+This module centralises default settings and exposes a convenient helper for
+loading ``config.yaml`` (or preset overrides) as a nested namespace.  Keeping
+all configuration semantics in one place makes it easier to experiment with the
+RAG pipeline while ensuring scripts like ``ingest.py`` and ``chat.py`` stay in
+sync.
+"""
+
+from __future__ import annotations
+
+import copy
+from pathlib import Path
+from typing import Any, Dict, Iterable, MutableMapping, Optional
+
+import yaml
+
+
+DEFAULT_CONFIG: Dict[str, Any] = {  # baseline values; config.yaml and overrides are merged on top
+    "logseq_root": "",  # no usable default: ingest exits when this is empty or not a directory
+    "include_dirs": ["journals", "pages"],
+    "exclude_globs": ["**/.git/**", "**/.DS_Store", "**/assets/**"],
+    "file_exts": [".md"],
+    "runtime": {
+        "request_timeout": 180,  # forwarded to the Ollama LLM client in chat.py
+    },
+    "chunk": {
+        "chunk_size": 900,
+        "chunk_overlap": 120,
+    },
+    "retrieval": {
+        "top_k": 6,
+        "mmr": {"enabled": True},
+        "query_expansion": {
+            "enabled": True,
+            "max_expansions": 6,
+            "synonyms": {  # merged key-by-key with config.yaml; each key's list is replaced wholesale
+                "sailing": ["sloop", "schooner", "boat"],
+                "boat": ["vessel", "ship"],
+            },
+        },
+    },
+    "models": {
+        "llm": {
+            "name": "llama3.1",
+            "temperature": 0.1,
+        },
+        "embedding": {
+            "name": "nomic-embed-text",
+        },
+    },
+    "storage": {
+        "chroma_path": ".rag/chroma",
+        "collection_name": "logseq_rag",
+        "clear_before_ingest": False,  # the evaluation runner forces this to True for each profile
+    },
+    "evaluation": {
+        "dataset": "evaluations/datasets/baseline.yaml",
+        "configurations_file": "evaluations/configurations.yaml",
+        "max_queries": None,  # None runs every query in the dataset
+        "scoring": {  # weights of the composite score; these defaults sum to 1.0
+            "accuracy_weight": 0.35,
+            "coverage_weight": 0.2,
+            "relevance_weight": 0.2,
+            "hallucination_weight": 0.15,
+            "speed_weight": 0.1,
+        },
+    },
+}
+
+
+def _deep_merge(base: MutableMapping[str, Any], updates: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
+    """Fold ``updates`` into ``base`` recursively and hand ``base`` back.
+
+    Nested mappings are combined one key at a time; any other value (lists
+    included) simply overwrites what was there.  The merge happens in-place,
+    so pass a copy of ``base`` if the original must survive untouched.
+    """
+    for name, incoming in updates.items():
+        current = base.get(name)
+        if not (isinstance(incoming, dict) and isinstance(current, dict)):
+            base[name] = incoming
+            continue
+        _deep_merge(current, incoming)
+    return base
+
+
+class ConfigNamespace(dict):
+    """Dict subclass that exposes attribute access for convenience."""
+    # Reads wrap nested dicts in a new ConfigNamespace (a copy), so ``cfg.a.b = 1`` never reaches ``cfg``.
+    def __getattr__(self, item: str) -> Any:  # pragma: no cover - trivial proxy
+        try:
+            value = self[item]
+        except KeyError as exc:  # pragma: no cover - guard for clarity
+            raise AttributeError(item) from exc
+        return _wrap(value)
+
+    __setattr__ = dict.__setitem__  # type: ignore
+    __delattr__ = dict.__delitem__  # type: ignore
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a deep Python ``dict`` copy of the namespace."""
+
+        return _unwrap(self)
+
+
+def _wrap(value: Any) -> Any:
+    """Expose mappings as ``ConfigNamespace``; lists are wrapped element-wise."""
+    if isinstance(value, list):
+        return list(map(_wrap, value))
+    if not isinstance(value, dict):
+        # Scalars and any other objects are handed back untouched.
+        return value
+    # A new namespace (a shallow copy of ``value``) is built on every call.
+    return ConfigNamespace(value)
+
+
+def _unwrap(value: Any) -> Any:
+    """Deep-copy ``value`` into plain ``dict``/``list`` containers."""
+    # ``ConfigNamespace`` subclasses ``dict``, so one check covers both.
+    if isinstance(value, dict):
+        return {key: _unwrap(inner) for key, inner in value.items()}
+    if isinstance(value, list):
+        return [_unwrap(element) for element in value]
+    return copy.deepcopy(value)
+
+
+def _load_yaml(path: Path) -> Dict[str, Any]:
+    """Parse ``path`` as YAML and return its top-level mapping ({} when the document is empty)."""
+    if not path.exists():
+        hint = "Create one by copying config.yaml.sample and adjusting the values."
+        raise FileNotFoundError(f"Configuration file not found: {path}\n{hint}")
+    with path.open("r", encoding="utf-8") as handle:
+        parsed = yaml.safe_load(handle)
+    document = parsed or {}
+    if isinstance(document, dict):
+        return document
+    raise ValueError(f"Expected mapping at top of {path}, got {type(document)!r}")
+
+
+def load_app_config(
+    path: Optional[Path] = None,
+    *,
+    overrides: Optional[Iterable[MutableMapping[str, Any]]] = None,
+) -> ConfigNamespace:
+    """Build the effective configuration as a ``ConfigNamespace``.
+
+    Layers are applied in order: ``DEFAULT_CONFIG``, then the YAML file at
+    ``path`` (``config.yaml`` in the working directory when omitted), then
+    each mapping in ``overrides``.  A missing file raises ``FileNotFoundError``.
+    """
+    merged = copy.deepcopy(DEFAULT_CONFIG)
+    _deep_merge(merged, _load_yaml(path or Path("config.yaml")))
+    # Overrides are copied first so the caller's mappings never get aliased.
+    for layer in overrides or ():
+        _deep_merge(merged, copy.deepcopy(layer))
+    return ConfigNamespace(merged)
+
+
+def apply_overrides(base: ConfigNamespace, *overrides: MutableMapping[str, Any]) -> ConfigNamespace:
+    """Layer ``overrides`` (left to right) over a deep copy of ``base``."""
+    result = base.to_dict()  # deep copy, so ``base`` itself is never mutated
+    for patch in overrides:
+        # Copy each override too: merged sub-dicts would otherwise be shared.
+        _deep_merge(result, copy.deepcopy(patch))
+    return ConfigNamespace(result)
+
+
+__all__ = ["ConfigNamespace", "DEFAULT_CONFIG", "apply_overrides", "load_app_config"]
diff --git a/chat.py b/chat.py
index eb2abdc..d172688 100644
--- a/chat.py
+++ b/chat.py
@@ -7,16 +7,15 @@
 """
 
 import chromadb
-import yaml
 from llama_index.core import Settings, VectorStoreIndex
 from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.llms.ollama import Ollama
 from llama_index.vector_stores.chroma import ChromaVectorStore
 
-with open("config.yaml", "r", encoding="utf-8") as f:
-    CONFIG = yaml.safe_load(f)
+from app_config import ConfigNamespace, load_app_config
+from query_expansion import expand_query
 
-def build_query_engine():
+def build_query_engine(config: ConfigNamespace):
     """Create a ``QueryEngine`` that can answer questions over the Logseq index.
 
     The steps here mirror the high-level components of a RAG system: choose an
@@ -32,16 +31,15 @@ def build_query_engine():
 
     # Models (local via Ollama)
     Settings.llm = Ollama(
-        model=CONFIG["models"]["llm"],
-        request_timeout=180,
-    )
-    Settings.embed_model = OllamaEmbedding(
-        model_name=CONFIG["models"]["embedding"],
+        model=config.models.llm.name,
+        request_timeout=config.runtime.request_timeout,
+        temperature=config.models.llm.temperature,
     )
+    Settings.embed_model = OllamaEmbedding(model_name=config.models.embedding.name)
 
     # Vector store
-    client = chromadb.PersistentClient(path=CONFIG["storage"]["chroma_path"])
-    collection = client.get_or_create_collection("logseq_rag")
+    client = chromadb.PersistentClient(path=config.storage.chroma_path)
+    collection = client.get_or_create_collection(config.storage.collection_name)
     vector_store = ChromaVectorStore(chroma_collection=collection)
 
     # Index from existing Chroma collection
@@ -49,8 +47,8 @@ def build_query_engine():
 
     # Let LlamaIndex create the retriever internally; pass our knobs only
     query_engine = index.as_query_engine(
-        similarity_top_k=CONFIG["retrieval"]["top_k"],
-        use_mmr=CONFIG["retrieval"]["mmr"],
+        similarity_top_k=config.retrieval.top_k,
+        use_mmr=getattr(config.retrieval.mmr, "enabled", False),
     )
     return query_engine
 
@@ -61,8 +59,10 @@ def main():
     pages. Use ``:q`` to exit when you are done experimenting.
     """
 
+    config = load_app_config()
+
     print("Loading query engine...")
-    qe = build_query_engine()
+    qe = build_query_engine(config)
     print("Ready. Type your question (or :q to quit).")
     while True:
         try:
@@ -73,7 +73,11 @@ def main():
         if q == ":q":
             break
 
-        resp = qe.query(q)
+        expanded = expand_query(q, config)
+        if expanded.changed:
+            print(f"\n(expanded query with synonyms: {', '.join(expanded.added_terms)})")
+
+        resp = qe.query(expanded.expanded)
 
         print("\n--- Answer ---")
         print(resp.response)
diff --git a/config.yaml.sample b/config.yaml.sample
index a5f22fb..a9ce180 100644
--- a/config.yaml.sample
+++ b/config.yaml.sample
@@ -1,33 +1,59 @@
-# Update this to the absolute path of your Logseq graph directory
-# (the folder that contains assets/, journals/, pages/)
-logseq_root: "/Users/jdoe/logseq"
+# Copy this file to config.yaml and customise the values for your graph.
+# The inline comments suggest proven alternatives so you can experiment quickly.
+
+# Absolute path to the Logseq graph folder (the directory that contains
+# journals/ and pages/).
+logseq_root: "/absolute/path/to/your/logseq"  # Example: "/Users/alex/notes"
 
 include_dirs:
-  - "journals"
-  - "pages"
+  - "journals"  # Add "journals/archive" to cover older notes
+  - "pages"     # Replace with "pages/projects" to scope ingestion
 
 exclude_globs:
   - "**/.git/**"
   - "**/.DS_Store"
-  - "**/assets/**"   # Skip heavy assets by default
+  - "**/assets/**"  # Swap to "**/*.{png,jpg}" to skip only large images
+
+file_exts: [".md"]  # Add ".org" if you keep Org-mode files alongside Markdown
 
-file_exts: [".md"]
+runtime:
+  request_timeout: 180  # Raise to 300 if Ollama frequently times out on long answers
 
 chunk:
-  by_headers: true
-  chunk_size: 1200
-  chunk_overlap: 200
+  chunk_size: 900    # Try 650 for higher recall or 1200 for faster ingestion
+  chunk_overlap: 120 # Reduce to 80 when chunks are larger to save memory
 
 retrieval:
-  top_k: 6
-  mmr: true
+  top_k: 6           # Increase to 10 when answers miss supporting context
+  mmr:
+    enabled: true    # Set to false for deterministic ranking when testing
+  query_expansion:
+    enabled: true    # Disable to rely solely on literal matches
+    max_expansions: 6  # Raise to 10 if you define longer synonym lists
+    synonyms:
+      sailing: ["sloop", "schooner", "boat"]  # Add "catamaran" for racing notes
+      boat: ["vessel", "ship"]                # Extend with "kayak" for paddling logs
 
-# On smaller machines use all-minilm, it's relatively lightweight
 models:
-  llm: "llama3.1"
-  embedding: "all-minilm"
+  llm:
+    name: "llama3.1"    # Alternatives: "mistral" for speed, "mixtral" for depth
+    temperature: 0.1     # Raise towards 0.3 for more conversational answers
+  embedding:
+    name: "nomic-embed-text"  # Swap to "all-minilm" when GPU memory is limited
 
 storage:
-  chroma_path: ".rag/chroma"
-  index_store: ".rag/index_store"
-  docstore: ".rag/docstore"
+  chroma_path: ".rag/chroma"          # Use ".rag/chroma_fast" for experiments
+  collection_name: "logseq_rag"        # Give each preset a unique name when testing
+  clear_before_ingest: false           # Set true to rebuild from scratch every run
+
+# Evaluation defaults used by evaluation/runner.py.
+evaluation:
+  dataset: "evaluations/datasets/baseline.yaml"
+  configurations_file: "evaluations/configurations.yaml"
+  max_queries: null   # Set to a small integer for smoke tests during tuning
+  scoring:
+    accuracy_weight: 0.35        # Raise to 0.5 to emphasise exact matches
+    coverage_weight: 0.2         # Lower when source recall is less critical
+    relevance_weight: 0.2        # Balance between required and optional keywords
+    hallucination_weight: 0.15   # Increase to punish unsupported citations
+    speed_weight: 0.1            # Boost when latency matters most
diff --git a/config/presets/default.yaml b/config/presets/default.yaml
new file mode 100644
index 0000000..894542a
--- /dev/null
+++ b/config/presets/default.yaml
@@ -0,0 +1,47 @@
+# Default configuration evaluated by the harness.  These values mirror the
+# settings shipped in config.yaml.sample so users can easily promote this preset
+# to their everyday configuration.  logseq_root is deliberately left unset: the
+# harness merges this file over config.yaml, which must supply the real path.
+include_dirs:
+  - "journals"
+  - "pages"
+exclude_globs:
+  - "**/.git/**"
+  - "**/.DS_Store"
+  - "**/assets/**"
+file_exts: [".md"]
+runtime:
+  request_timeout: 180
+chunk:
+  chunk_size: 900
+  chunk_overlap: 120
+retrieval:
+  top_k: 6
+  mmr:
+    enabled: true
+  query_expansion:
+    enabled: true
+    max_expansions: 6
+    synonyms:
+      sailing: ["sloop", "schooner", "boat"]
+      boat: ["vessel", "ship"]
+models:
+  llm:
+    name: "llama3.1"
+    temperature: 0.1
+  embedding:
+    name: "nomic-embed-text"
+storage:
+  chroma_path: ".rag/chroma"
+  collection_name: "logseq_rag"
+  clear_before_ingest: false
+evaluation:
+  dataset: "evaluations/datasets/baseline.yaml"
+  configurations_file: "evaluations/configurations.yaml"
+  max_queries: null
+  scoring:
+    accuracy_weight: 0.35
+    coverage_weight: 0.2
+    relevance_weight: 0.2
+    hallucination_weight: 0.15
+    speed_weight: 0.1
diff --git a/config/presets/high_recall.yaml b/config/presets/high_recall.yaml
new file mode 100644
index 0000000..fd9958a
--- /dev/null
+++ b/config/presets/high_recall.yaml
@@ -0,0 +1,16 @@
+# Prioritise coverage: smaller chunks, heavier overlap, deeper retrieval.
+chunk:
+  chunk_size: 650
+  chunk_overlap: 160
+retrieval:
+  top_k: 10
+  mmr:
+    enabled: true
+  query_expansion:
+    enabled: true
+    max_expansions: 10
+models:
+  llm:
+    temperature: 0.05
+storage:
+  collection_name: "logseq_rag_high_recall"
diff --git a/config/presets/lightweight.yaml b/config/presets/lightweight.yaml
new file mode 100644
index 0000000..6bac9b6
--- /dev/null
+++ b/config/presets/lightweight.yaml
@@ -0,0 +1,15 @@
+# Latency-focused profile for quick iteration on modest hardware.
+chunk:
+  chunk_size: 1200
+  chunk_overlap: 80
+retrieval:
+  top_k: 4
+  mmr:
+    enabled: false
+  query_expansion:
+    enabled: false
+models:
+  llm:
+    temperature: 0.0
+storage:
+  collection_name: "logseq_rag_fast"
diff --git a/evaluation/runner.py b/evaluation/runner.py
new file mode 100644
index 0000000..753b2ad
--- /dev/null
+++ b/evaluation/runner.py
@@ -0,0 +1,247 @@
+"""Quantitative evaluation harness for logseq-chat configurations.
+
+Running ``python evaluation/runner.py`` (or ``make evaluate``) executes the
+following steps:
+
+1. Load the user's base configuration from ``config.yaml``.
+2. Expand it with the presets listed in ``evaluations/configurations.yaml``.
+3. Re-ingest the Logseq graph for each preset into an isolated storage
+   directory.
+4. Query the index with prompts from ``evaluations/datasets/baseline.yaml``.
+5. Compute accuracy, coverage, hallucination, relevance, and latency metrics.
+6. Report a ranked leaderboard and identify the best configuration.
+
+The script runs entirely against your local Ollama models and Chroma store, so
+you can iterate rapidly on new presets or datasets; expect scores and latency
+to vary slightly between runs.  Replace the sample queries with your own
+gold-standard answers to turn this into a bespoke tuning loop for your notes.
+"""
+
+from __future__ import annotations
+
+import argparse
+import statistics
+import time
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple
+
+import yaml
+
+from app_config import ConfigNamespace, apply_overrides, load_app_config
+from chat import build_query_engine
+from ingest import run_ingest
+from query_expansion import expand_query
+
+
+def _load_yaml(path: Path) -> Dict[str, Any]:
+    """Read ``path`` as YAML; the top level must be a mapping (an empty file gives {})."""
+    content = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+    if not isinstance(content, dict):
+        raise ValueError(f"Expected mapping at top of {path}, got {type(content)!r}")
+    return content
+
+
+def _ensure_profiled_storage(config: ConfigNamespace, profile: str) -> ConfigNamespace:
+    """Return a copy of ``config`` whose Chroma path and collection are private to ``profile``."""
+    updated = config.to_dict()
+    storage = updated["storage"]
+    storage["chroma_path"] = str(Path(".rag") / "evaluations" / profile / "chroma")
+    storage["collection_name"] = f"{storage.get('collection_name', 'logseq_rag')}_{profile}"
+    storage["clear_before_ingest"] = True
+    return ConfigNamespace(updated)
+
+
+def _extract_sources(response: Any) -> List[str]:
+    """List the ``source`` (or ``file_path``) metadata of every retrieved node, in order."""
+    found: List[str] = []
+    for hit in getattr(response, "source_nodes", None) or ():
+        meta = getattr(getattr(hit, "node", hit), "metadata", None) or {}
+        origin = meta.get("source") or meta.get("file_path")
+        found.extend([str(origin)] if origin else [])
+    return found
+
+
+def _lower_list(values: Iterable[str]) -> List[str]:  # lower-cased copy of ``values``
+    return [str(v).lower() for v in values]  # str(): YAML scalars may parse as int/bool
+
+
+def _score_query(response_text: str, sources: List[str], spec: Dict[str, Any]) -> Dict[str, float]:
+    """Score one response against the expectations in its dataset ``spec``.
+
+    Keyword metrics are case-insensitive substring matches on ``response_text``;
+    source metrics compare ``sources`` with ``expected_sources`` as exact
+    lower-cased strings.  Accuracy, relevance and coverage default to 1.0
+    when there is nothing to measure; the hallucination rate defaults to 0.0.
+    """
+    text = response_text.lower()
+    keywords = spec.get("answer_keywords", {}) or {}
+    required = _lower_list(keywords.get("required", []) or [])
+    optional = _lower_list(keywords.get("optional", []) or [])
+    required_hits = sum(1 for kw in required if kw in text)
+    optional_hits = sum(1 for kw in optional if kw in text)
+    accuracy = required_hits / len(required) if required else 1.0
+    relevance = (required_hits + optional_hits) / (len(required) + len(optional)) if (required or optional) else 1.0
+    # Coverage is the share of distinct expected sources that were retrieved, so
+    # several chunks from the same file cannot push it above 1.0.
+    expected = set(_lower_list(spec.get("expected_sources", []) or []))
+    sources_lower = _lower_list(sources)
+    coverage = len(expected.intersection(sources_lower)) / len(expected) if expected else 1.0
+    # Every retrieved node outside the expected set counts, unless extras are allowed.
+    allow_extra = bool(spec.get("allow_additional_sources", False))
+    unexpected = 0 if allow_extra else sum(1 for src in sources_lower if src not in expected)
+    hallucination_rate = unexpected / len(sources_lower) if sources_lower else 0.0
+    return {
+        "accuracy": accuracy, "relevance": relevance,
+        "coverage": coverage, "hallucination_rate": hallucination_rate,
+    }
+
+
+def evaluate_configuration(
+    name: str,
+    base_config: ConfigNamespace,
+    overrides: Dict[str, Any],
+    dataset: Dict[str, Any],
+    *,
+    max_queries: int | None,
+) -> Tuple[ConfigNamespace, Dict[str, Any]]:
+    """Ingest and query one configuration, returning it with its metrics.
+
+    ``overrides`` are layered over ``base_config``, storage is redirected to a
+    directory private to ``name``, the graph is re-ingested, and each dataset
+    query (at most ``max_queries``; ``None`` or ``0`` means all) is timed and
+    scored.  The result holds per-query ``raw`` rows and their mean ``metrics``.
+    """
+    profiled = _ensure_profiled_storage(apply_overrides(base_config, overrides), name)
+    run_ingest(profiled, verbose=False)
+    query_engine = build_query_engine(profiled)
+
+    queries = dataset.get("queries", [])
+    limit = max_queries or len(queries)
+    results: List[Dict[str, float]] = []
+    for spec in queries[:limit]:
+        expanded = expand_query(spec["question"], profiled)
+        started = time.perf_counter()
+        response = query_engine.query(expanded.expanded)
+        elapsed_ms = (time.perf_counter() - started) * 1000
+        row = _score_query(getattr(response, "response", ""), _extract_sources(response), spec)
+        row["latency_ms"] = elapsed_ms
+        results.append(row)
+
+    if not results:
+        # Nothing ran: fall back to worst-case placeholder figures.
+        aggregate = {"accuracy": 0.0, "relevance": 0.0, "coverage": 0.0, "hallucination_rate": 1.0, "avg_latency_ms": float("inf")}
+    else:
+        aggregate = {key: statistics.mean(r[key] for r in results) for key in ("accuracy", "relevance", "coverage", "hallucination_rate")}
+        aggregate["avg_latency_ms"] = statistics.mean(r["latency_ms"] for r in results)
+    return profiled, {"metrics": aggregate, "raw": results}
+
+
+def _composite_score(metrics: Dict[str, float], weights: Dict[str, float], speed_anchor: float) -> float:
+    """Return the weighted sum of the quality metrics plus a relative speed bonus (anchor / latency, capped at 1)."""
+    latency = metrics.get("avg_latency_ms", float("inf"))
+    # Non-finite values (no queries ran) earn no bonus; ``inf / inf`` would turn the score into NaN.
+    measurable = 0 < latency < float("inf") and 0 < speed_anchor < float("inf")
+    speed_score = min(speed_anchor / latency, 1.0) if measurable else 0.0
+    return (
+        weights.get("accuracy_weight", 0.0) * metrics.get("accuracy", 0.0)
+        + weights.get("coverage_weight", 0.0) * metrics.get("coverage", 0.0)
+        + weights.get("relevance_weight", 0.0) * metrics.get("relevance", 0.0)
+        + weights.get("hallucination_weight", 0.0) * (1 - metrics.get("hallucination_rate", 0.0))
+        + weights.get("speed_weight", 0.0) * speed_score
+    )
+
+
+def main() -> None:  # CLI entry point: evaluate every listed preset and report the best one
+    parser = argparse.ArgumentParser(description="Evaluate configuration presets against a labelled dataset.")
+    parser.add_argument("--dataset", type=Path, default=None, help="Override dataset path")
+    parser.add_argument("--configurations", type=Path, default=None, help="Override configurations list")
+    parser.add_argument("--max-queries", type=int, default=None, help="Limit the number of queries executed per configuration")
+    args = parser.parse_args()
+
+    base_config = load_app_config()
+    evaluation_cfg = base_config.evaluation
+    # Command-line flags take precedence over the ``evaluation`` section of the config.
+    dataset_path = args.dataset or Path(evaluation_cfg.dataset)
+    config_list_path = args.configurations or Path(evaluation_cfg.configurations_file)
+    max_queries = args.max_queries if args.max_queries is not None else evaluation_cfg.max_queries
+
+    dataset = _load_yaml(dataset_path)
+    config_specs = _load_yaml(config_list_path).get("configurations", [])
+
+    if not config_specs:
+        raise SystemExit(f"No configurations listed in {config_list_path}")
+
+    weights = evaluation_cfg.scoring.to_dict() if hasattr(evaluation_cfg.scoring, "to_dict") else dict(evaluation_cfg.scoring)
+
+    leaderboard: List[Tuple[str, float, Dict[str, Any]]] = []
+    best_config: Tuple[str, ConfigNamespace, Dict[str, Any]] | None = None
+    latencies: List[float] = []  # NOTE(review): appended to below but never read
+    per_config_results: Dict[str, Dict[str, Any]] = {}
+
+    for entry in config_specs:
+        name = entry.get("name")
+        if not name:
+            raise ValueError(f"Configuration entry missing name: {entry}")
+        preset_path = entry.get("preset")
+        overrides: Dict[str, Any] = {}
+        if preset_path:
+            overrides = _load_yaml(Path(preset_path))
+        profile_config, result = evaluate_configuration(
+            name,
+            base_config,
+            overrides,
+            dataset,
+            max_queries=max_queries,
+        )
+        metrics = result["metrics"]
+        latencies.append(metrics.get("avg_latency_ms", 0.0))
+        per_config_results[name] = {"config": profile_config.to_dict(), **result}
+        leaderboard.append((name, 0.0, metrics))  # score placeholder; filled in by the second pass
+
+    if not leaderboard:
+        raise SystemExit("No evaluation results produced.")
+    # The lowest average latency across profiles is the anchor for the relative speed score.
+    speed_anchor = min((metrics.get("avg_latency_ms", float("inf")) for _, _, metrics in leaderboard), default=0.0)
+
+    adjusted: List[Tuple[str, float, Dict[str, Any]]] = []
+    for name, _, metrics in leaderboard:
+        score = _composite_score(metrics, weights, speed_anchor)
+        adjusted.append((name, score, metrics))
+        if best_config is None or score > best_config[2]["metrics"]["score"]:
+            best_config = (name, per_config_results[name]["config"], {"metrics": {**metrics, "score": score}})
+
+    adjusted.sort(key=lambda item: item[1], reverse=True)
+
+    print("\nConfiguration leaderboard:")
+    print("-------------------------")
+    for name, score, metrics in adjusted:
+        print(
+            f"{name:>15}  score={score:0.3f}  accuracy={metrics['accuracy']:0.3f}  "
+            f"coverage={metrics['coverage']:0.3f}  relevance={metrics['relevance']:0.3f}  "
+            f"hallucinations={metrics['hallucination_rate']:0.3f}  avg_latency_ms={metrics['avg_latency_ms']:0.1f}"
+        )
+
+    if best_config:
+        best_name, best_settings, payload = best_config
+        print(
+            "\nBest configuration:",
+            best_name,
+            f"(score={payload['metrics']['score']:0.3f})",
+        )
+        output_path = Path("evaluations/results")
+        output_path.mkdir(parents=True, exist_ok=True)
+        with (output_path / "latest.yaml").open("w", encoding="utf-8") as fh:
+            yaml.safe_dump(
+                {
+                    "best_configuration": best_name,
+                    "score": payload["metrics"]["score"],
+                    "metrics": {k: v for k, v in payload["metrics"].items() if k != "score"},
+                },
+                fh,
+                sort_keys=False,
+            )
+        print(f"Saved summary to {output_path / 'latest.yaml'}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluations/configurations.yaml b/evaluations/configurations.yaml
new file mode 100644
index 0000000..dca0e08
--- /dev/null
+++ b/evaluations/configurations.yaml
@@ -0,0 +1,14 @@
+# Presets evaluated by evaluation/runner.py.  Each entry is merged with the
+# base configuration from config.yaml before ingestion and querying.
+configurations:
+  - name: balanced
+    preset: config/presets/default.yaml
+    description: "Balanced accuracy and speed; enables synonym expansion for conceptual recall."
+
+  - name: high_recall
+    preset: config/presets/high_recall.yaml
+    description: "Favors recall with smaller chunks and deeper retrieval."
+
+  - name: fast_local
+    preset: config/presets/lightweight.yaml
+    description: "Optimised for latency on CPUs; limits chunk overlap and disables expansion."
diff --git a/evaluations/datasets/baseline.yaml b/evaluations/datasets/baseline.yaml
new file mode 100644
index 0000000..384a198
--- /dev/null
+++ b/evaluations/datasets/baseline.yaml
@@ -0,0 +1,47 @@
+# Replace the sample questions with prompts that reflect your own graph.
+# The evaluation harness measures how well each configuration answers these
+# questions and whether the returned sources match your expectations.
+queries:
+  - id: sailing
+    question: "What did I write about sailing?"
+    answer_keywords:
+      required:
+        - "sloop"
+      optional:
+        - "sailing"
+        - "harbor"
+        - "schooner"
+    expected_sources:
+      - "journals/2024-06-01.md"
+      - "pages/sailing.md"
+    allow_additional_sources: true
+    notes: "Focus on nautical synonyms so boats like sloops and schooners are surfaced."
+
+  - id: roadmap_meeting
+    question: "Summarize my meeting with Alex about the roadmap."
+    answer_keywords:
+      required:
+        - "roadmap"
+      optional:
+        - "Alex"
+        - "milestones"
+        - "timeline"
+    expected_sources:
+      - "pages/roadmap_meeting.md"
+      - "journals/2024-04-18.md"
+    allow_additional_sources: false
+    notes: "Use this to watch for hallucinations that mention participants who were not present."
+
+  - id: finances
+    question: "How are my quarterly finances looking?"
+    answer_keywords:
+      required:
+        - "quarter"
+        - "revenue"
+      optional:
+        - "expenses"
+        - "profit"
+    expected_sources:
+      - "pages/finances/q2.md"
+    allow_additional_sources: false
+    notes: "Ensures numerical summaries are grounded in the notes."
diff --git a/ingest.py b/ingest.py
index 987b831..8322261 100644
--- a/ingest.py
+++ b/ingest.py
@@ -20,15 +20,13 @@
 from typing import List
 
 import chromadb
-import yaml
 from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
 from llama_index.core.node_parser import SimpleNodeParser
 from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.llms.ollama import Ollama
 from llama_index.vector_stores.chroma import ChromaVectorStore
 
-with open("config.yaml", "r", encoding="utf-8") as f:
-    CONFIG = yaml.safe_load(f)
+from app_config import ConfigNamespace, load_app_config
 
 PAGE_LINK = re.compile(r"\[\[([^\]]+)\]\]")                 # [[Page]]
 BLOCK_REF = re.compile(r"\(\(([a-zA-Z0-9_-]{6,})\)\)")       # ((block-id))
@@ -120,48 +118,87 @@ def load_documents(paths: List[str]) -> List[Document]:
         docs.append(Document(text=clean, metadata=meta))
     return docs
 
-def main():
-    """Run the full ingestion workflow using settings from ``config.yaml``.
+def run_ingest(config: ConfigNamespace, *, verbose: bool = True) -> None:
+    """Run the ingestion workflow described by ``config``.
 
-    Running this function end-to-end shows how data collection, cleaning,
-    chunking, and indexing fit together in a practical RAG pipeline.
-    """
-
-    root = CONFIG["logseq_root"]
-    include_dirs = CONFIG["include_dirs"]
-    file_exts = CONFIG["file_exts"]
-    exclude = CONFIG["exclude_globs"]
+    Files are collected and loaded, split into nodes, and indexed into the
+    configured Chroma collection.
+
+    Args:
+        config: Application configuration; reads ``logseq_root``,
+            ``include_dirs``, ``file_exts``, ``exclude_globs``, ``models``,
+            ``runtime.request_timeout``, ``chunk`` and ``storage``.
+        verbose: When true, print a progress line after each stage.
+
+    Raises:
+        SystemExit: If ``config.logseq_root`` is empty or not a directory.
+    """
+
+    root = config.logseq_root
+    include_dirs = list(config.include_dirs)
+    file_exts = list(config.file_exts)
+    exclude = list(config.exclude_globs)
 
-    if not os.path.isdir(root):
-        raise SystemExit(f"Logseq root does not exist: {root}\nEdit config.yaml to set logseq_root.")
+    if not root or not os.path.isdir(root):
+        raise SystemExit(
+            f"Logseq root does not exist: {root}\n"
+            "Edit config.yaml to set logseq_root before running ingest."
+        )
 
     paths = collect_files(root, include_dirs, file_exts, exclude)
-    print(f"Found {len(paths)} markdown files.")
+    if verbose:
+        print(f"Found {len(paths)} markdown files.")
 
     docs = load_documents(paths)
-    print(f"Loaded {len(docs)} documents.")
+    if verbose:
+        print(f"Loaded {len(docs)} documents.")
 
-    Settings.llm = Ollama(model=CONFIG["models"]["llm"], request_timeout=180)
-    Settings.embed_model = OllamaEmbedding(model_name=CONFIG["models"]["embedding"])
+    Settings.llm = Ollama(
+        model=config.models.llm.name,
+        request_timeout=config.runtime.request_timeout,
+        temperature=config.models.llm.temperature,
+    )
+    Settings.embed_model = OllamaEmbedding(model_name=config.models.embedding.name)
 
     parser = SimpleNodeParser.from_defaults(
         include_metadata=True,
-        chunk_size=CONFIG["chunk"]["chunk_size"],
-        chunk_overlap=CONFIG["chunk"]["chunk_overlap"]
+        chunk_size=config.chunk.chunk_size,
+        chunk_overlap=config.chunk.chunk_overlap,
     )
     nodes = parser.get_nodes_from_documents(docs)
-    print(f"Parsed into {len(nodes)} nodes.")
+    if verbose:
+        print(f"Parsed into {len(nodes)} nodes.")
 
-    chroma_path = CONFIG["storage"]["chroma_path"]
+    chroma_path = config.storage.chroma_path
     os.makedirs(chroma_path, exist_ok=True)
     client = chromadb.PersistentClient(path=chroma_path)
-    collection = client.get_or_create_collection("logseq_rag")
+
+    collection_name = config.storage.collection_name
+    if getattr(config.storage, "clear_before_ingest", False):
+        # Drop the existing collection so it is recreated empty below.
+        try:
+            client.delete_collection(collection_name)
+        except Exception as exc:
+            # Deliberately best-effort (presumably the collection may not
+            # exist yet), but report the reason instead of hiding it.
+            if verbose:
+                print(f"Skipped clearing collection {collection_name!r}: {exc}")
+
+    collection = client.get_or_create_collection(collection_name)
 
     vector_store = ChromaVectorStore(chroma_collection=collection)
     storage_ctx = StorageContext.from_defaults(vector_store=vector_store)
 
     _ = VectorStoreIndex(nodes, storage_context=storage_ctx)
-    print("Index built and persisted to Chroma.")
+    if verbose:
+        print("Index built and persisted to Chroma.")
+
+
+def main() -> None:
+    """Run the full ingestion workflow using settings from ``config.yaml``."""
+
+    config = load_app_config()
+    run_ingest(config)
 
 if __name__ == "__main__":
     main()
diff --git a/query_expansion.py b/query_expansion.py
new file mode 100644
index 0000000..2f56e90
--- /dev/null
+++ b/query_expansion.py
@@ -0,0 +1,139 @@
+"""Utilities for expanding user queries before retrieval.
+
+The heuristic implemented here is intentionally lightweight: it simply appends
+pre-defined synonyms to the natural language question so that vector search can
+match conceptually related terms (e.g. *sloop* when the user asks about
+"sailing").  The synonym lists live in ``config.yaml`` so advanced users can
+fine-tune the behaviour without modifying code.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import Any, Dict, List
+
+from app_config import ConfigNamespace
+
+
+# A "word" is a run of word characters, apostrophes, or hyphens.
+_WORD = re.compile(r"[\w'-]+")
+
+
+@dataclass
+class ExpandedQuery:
+    """Outcome of :func:`expand_query` for a single question."""
+
+    # The question exactly as supplied by the caller.
+    original: str
+    # The question with ``added_terms`` appended; equals ``original`` when
+    # nothing was added.
+    expanded: str
+    # Lower-cased terms appended to the question, in the order they were added.
+    added_terms: List[str]
+
+    @property
+    def changed(self) -> bool:
+        """Return ``True`` when at least one term was added."""
+        return bool(self.added_terms)
+
+
+def _normalize_synonyms(raw: Any) -> Dict[str, List[str]]:
+    """Return ``raw`` as a lower-cased ``{root: [related terms]}`` dict.
+
+    Anything that is not a mapping yields an empty dict.  A scalar value is
+    treated as one related term (so ``boat: sloop`` is not split into
+    characters), ``None`` contributes nothing, and blank terms are dropped.
+    Roots that differ only by case or surrounding whitespace are merged.
+    """
+
+    if not isinstance(raw, Mapping):
+        return {}
+
+    normalized: Dict[str, List[str]] = {}
+    for key, values in raw.items():
+        if values is None:
+            candidates = []
+        elif isinstance(values, (list, tuple, set, frozenset)):
+            candidates = list(values)
+        else:
+            candidates = [values]
+        bucket = normalized.setdefault(str(key).strip().lower(), [])
+        for value in candidates:
+            term = "" if value is None else str(value).strip().lower()
+            if term:
+                bucket.append(term)
+    return normalized
+
+
+def expand_query(question: str, config: ConfigNamespace) -> ExpandedQuery:
+    """Expand ``question`` with synonym hints defined in ``config``.
+
+    Settings are read from ``config.retrieval.query_expansion``:
+
+    * ``enabled`` -- expansion only happens when this is truthy.
+    * ``synonyms`` -- mapping of a root term to its related term(s).
+    * ``max_expansions`` -- optional non-negative integer cap on added terms.
+
+    A word of the question that is a root pulls in its related terms; a word
+    that is only listed as a related term pulls in every root that lists it.
+    Matching is case-insensitive.  Added terms are lower-cased, de-duplicated,
+    never repeat a word already in the question, and are appended to the
+    question separated by single spaces.
+
+    Returns:
+        An :class:`ExpandedQuery`.  When expansion is disabled, unconfigured,
+        or produces no new terms, ``expanded`` equals ``question`` and
+        ``added_terms`` is empty.
+    """
+
+    unchanged = ExpandedQuery(question, question, [])
+
+    # Missing sections simply mean "expansion is off" rather than an error.
+    retrieval_cfg = getattr(config, "retrieval", None)
+    expansion_cfg = getattr(retrieval_cfg, "query_expansion", None)
+    if not getattr(expansion_cfg, "enabled", False):
+        return unchanged
+
+    # NOTE(review): this assumes ``synonyms`` reaches us as a mapping; confirm
+    # how ConfigNamespace exposes nested dictionaries.
+    synonym_map = _normalize_synonyms(getattr(expansion_cfg, "synonyms", None))
+    if not synonym_map:
+        return unchanged
+
+    tokens = [token.lower() for token in _WORD.findall(question)]
+
+    # ``seen`` starts with the question's own words so they are never re-added,
+    # and doubles as the O(1) duplicate check for terms added so far.
+    seen = set(tokens)
+    added: List[str] = []
+    for token in tokens:
+        if token in synonym_map:
+            candidates = synonym_map[token]
+        else:
+            # Reverse lookup: the word is a synonym, so surface its root(s).
+            candidates = [
+                root for root, related in synonym_map.items() if token in related
+            ]
+        for term in candidates:
+            if term not in seen:
+                seen.add(term)
+                added.append(term)
+
+    max_terms = getattr(expansion_cfg, "max_expansions", None)
+    # ``bool`` is a subclass of ``int``; ``max_expansions: true`` is not a cap.
+    if (
+        isinstance(max_terms, int)
+        and not isinstance(max_terms, bool)
+        and max_terms >= 0
+    ):
+        added = added[:max_terms]
+
+    if not added:
+        return unchanged
+
+    return ExpandedQuery(question, f"{question} {' '.join(added)}", added)
+
+
+__all__ = ["ExpandedQuery", "expand_query"]
diff --git a/tests/test_ingest.py b/tests/test_ingest.py
index 121a40a..23979d7 100644
--- a/tests/test_ingest.py
+++ b/tests/test_ingest.py
@@ -3,15 +3,146 @@
 from pathlib import Path
 import textwrap
 
+import importlib
+import sys
+import types
+
 import pytest
 
 
+def _install_dependency_stubs():
+    """Register minimal stand-ins for third-party modules in ``sys.modules``.
+
+    Stubs are provided for ``chromadb``, the ``llama_index`` modules listed
+    below, and ``yaml``, so importing them works without the real packages.
+    A stub is installed only when the module name is absent from
+    ``sys.modules``; that means "not imported yet", so an installed package
+    that nothing has imported so far is replaced by the stub as well.  The
+    stubs are never removed and stay registered for the rest of the process.
+    """
+
+    if "chromadb" not in sys.modules:
+        chromadb = types.ModuleType("chromadb")
+
+        class _Collection:
+            pass
+
+        class _Client:
+            def __init__(self, *_, **__):
+                self._collection = _Collection()
+
+            # Any collection name resolves to the one placeholder collection.
+            def get_or_create_collection(self, *_args, **_kwargs):
+                return self._collection
+
+            def delete_collection(self, *_args, **_kwargs):
+                return None
+
+        chromadb.PersistentClient = _Client
+        sys.modules["chromadb"] = chromadb
+
+    if "llama_index" not in sys.modules:
+        # Bare parent module; each submodule stub below is registered under
+        # its own dotted name.
+        root = types.ModuleType("llama_index")
+        sys.modules["llama_index"] = root
+
+    if "llama_index.core" not in sys.modules:
+        core = types.ModuleType("llama_index.core")
+
+        class _DummyDocument:
+            def __init__(self, text: str, metadata: dict):
+                self.text = text
+                self.metadata = metadata
+
+        class _DummySettings:
+            llm = None
+            embed_model = None
+
+        class _DummyStorageContext:
+            @classmethod
+            def from_defaults(cls, **_kwargs):
+                return cls()
+
+        class _DummyVectorStoreIndex:
+            def __init__(self, *_, **__):
+                pass
+
+        core.Document = _DummyDocument
+        core.Settings = _DummySettings
+        core.StorageContext = _DummyStorageContext
+        core.VectorStoreIndex = _DummyVectorStoreIndex
+        sys.modules["llama_index.core"] = core
+
+    if "llama_index.core.node_parser" not in sys.modules:
+        node_parser = types.ModuleType("llama_index.core.node_parser")
+
+        class _Parser:
+            @classmethod
+            def from_defaults(cls, **_kwargs):
+                return cls()
+
+            # Pass-through: the documents themselves are returned as the nodes.
+            def get_nodes_from_documents(self, documents):
+                return documents
+
+        node_parser.SimpleNodeParser = _Parser
+        sys.modules["llama_index.core.node_parser"] = node_parser
+
+    if "llama_index.embeddings.ollama" not in sys.modules:
+        embeddings = types.ModuleType("llama_index.embeddings.ollama")
+
+        class _DummyEmbedding:
+            def __init__(self, *_, **__):
+                pass
+
+        embeddings.OllamaEmbedding = _DummyEmbedding
+        sys.modules["llama_index.embeddings.ollama"] = embeddings
+
+    if "llama_index.llms.ollama" not in sys.modules:
+        llms = types.ModuleType("llama_index.llms.ollama")
+
+        class _DummyLLM:
+            def __init__(self, *_, **__):
+                pass
+
+        llms.Ollama = _DummyLLM
+        sys.modules["llama_index.llms.ollama"] = llms
+
+    if "llama_index.vector_stores.chroma" not in sys.modules:
+        vector_store = types.ModuleType("llama_index.vector_stores.chroma")
+
+        class _DummyVectorStore:
+            def __init__(self, *_, **__):
+                pass
+
+        vector_store.ChromaVectorStore = _DummyVectorStore
+        sys.modules["llama_index.vector_stores.chroma"] = vector_store
+
+    if "yaml" not in sys.modules:
+        yaml_stub = types.ModuleType("yaml")
+
+        # Ignores its input: every load yields an empty mapping.
+        def _safe_load(data):
+            return {}
+
+        # Writes nothing.
+        def _safe_dump(_data, _fh, **_kwargs):
+            return None
+
+        yaml_stub.safe_load = _safe_load
+        yaml_stub.safe_dump = _safe_dump
+        sys.modules["yaml"] = yaml_stub
+
+
 @pytest.fixture(scope="session")
 def ingest_module():
     project_root = Path(__file__).resolve().parents[1]
     config_path = project_root / "config.yaml"
     created = False
 
+    _install_dependency_stubs()
+
     if not config_path.exists():
         config_path.write_text(
             textwrap.dedent(
@@ -20,17 +151,37 @@ def ingest_module():
                 include_dirs: []
                 file_exts: []
                 exclude_globs: []
+                runtime:
+                  request_timeout: 30
                 models:
-                  llm: llama3.1
-                  embedding: nomic-embed-text
+                  llm:
+                    name: llama3.1
+                    temperature: 0.0
+                  embedding:
+                    name: nomic-embed-text
                 storage:
                   chroma_path: /tmp/chroma
+                  collection_name: test_collection
+                  clear_before_ingest: true
                 retrieval:
                   top_k: 5
-                  mmr: false
+                  mmr:
+                    enabled: false
+                  query_expansion:
+                    enabled: false
                 chunk:
                   chunk_size: 512
                   chunk_overlap: 50
+                evaluation:
+                  dataset: evaluations/datasets/baseline.yaml
+                  configurations_file: evaluations/configurations.yaml
+                  max_queries: null
+                  scoring:
+                    accuracy_weight: 0.35
+                    coverage_weight: 0.2
+                    relevance_weight: 0.2
+                    hallucination_weight: 0.15
+                    speed_weight: 0.1
                 """
             ).strip()
         )