From cae50f1a90157af7974fb4bfb6c86d9c5542dc24 Mon Sep 17 00:00:00 2001 From: Cory Donnelly <152584+crd@users.noreply.github.com> Date: Fri, 7 Nov 2025 08:23:49 -0500 Subject: [PATCH] Add configuration presets and evaluation harness --- .gitignore | 3 + Makefile | 9 +- README.md | 43 ++++- app_config.py | 169 ++++++++++++++++++++ chat.py | 34 ++-- config.yaml.sample | 62 +++++--- config/presets/default.yaml | 47 ++++++ config/presets/high_recall.yaml | 16 ++ config/presets/lightweight.yaml | 15 ++ evaluation/runner.py | 247 +++++++++++++++++++++++++++++ evaluations/configurations.yaml | 14 ++ evaluations/datasets/baseline.yaml | 47 ++++++ ingest.py | 70 +++++--- query_expansion.py | 77 +++++++++ tests/test_ingest.py | 143 ++++++++++++++++- 15 files changed, 930 insertions(+), 66 deletions(-) create mode 100644 app_config.py create mode 100644 config/presets/default.yaml create mode 100644 config/presets/high_recall.yaml create mode 100644 config/presets/lightweight.yaml create mode 100644 evaluation/runner.py create mode 100644 evaluations/configurations.yaml create mode 100644 evaluations/datasets/baseline.yaml create mode 100644 query_expansion.py diff --git a/.gitignore b/.gitignore index 4e3547a..cb73e9d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,5 +12,8 @@ wheels/ # RAG location .rag +# Evaluation artefacts +evaluations/results/ + # Config file config.yaml diff --git a/Makefile b/Makefile index 0a65ca5..c270313 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: venv install ingest chat lock clean +.PHONY: venv install ingest chat evaluate lock clean # Create/refresh a local .venv and install deps from pyproject/uv.lock install: @@ -13,11 +13,14 @@ ingest: uv run ingest.py chat: - uv run chat.py + uv run chat.py + +evaluate: + uv run evaluation/runner.py # Run the automated test suite test: - uv run --extra dev pytest + uv run --extra dev pytest # Create/update a lockfile explicitly (optional; uv sync also updates it) lock: diff --git a/README.md 
b/README.md index d54ba7e..3d61259 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,23 @@ cd logseq-chat make install ``` -Edit `config.yaml` and at a minimum set `logseq_root` to your Logseq graph directory. +Copy `config.yaml.sample` to `config.yaml` and customise the values. The sample +lists tuned defaults plus alternative values (chunk sizes, retrieval depth, +synonym lists, etc.) that you can toggle as you experiment. At minimum set +`logseq_root` to your Logseq graph directory. + +## Configuration cheat sheet +- **Chunk size / overlap** – controls how much context each embedding sees. + Smaller chunks with slightly larger overlaps (`chunk_size: 650`, + `chunk_overlap: 160`) improve recall; larger chunks (`chunk_size: 1200`) speed + things up on slower machines. +- **Retrieval depth** – adjust `retrieval.top_k` and `retrieval.mmr.enabled` to + trade recall for latency. +- **Query expansion** – populate `retrieval.query_expansion.synonyms` with + domain-specific vocabulary. Asking “What did I write about sailing?” will also + search for “sloop” and “schooner” with the default config. +- **Model temperature** – lower values keep answers grounded; increase towards + `0.3` for more conversational replies. ## Build index ```bash @@ -41,6 +57,26 @@ make chat make test ``` +## Evaluate presets +```bash +make evaluate +``` + +The evaluation harness ingests your graph for each preset listed in +`evaluations/configurations.yaml`, runs the labelled queries from +`evaluations/datasets/baseline.yaml`, and prints a leaderboard ranked by the +weighted scoring formula defined in `config.yaml`. The bundled presets are: + +| Name | Purpose | +| ----------- | ------------------------------------------------- | +| balanced | Default profile – accuracy, coverage, and speed. | +| high_recall | Smaller chunks, deeper retrieval, more overlap. | +| fast_local | Larger chunks, shallow retrieval for quick tests. 
| + +After the run, the best-scoring configuration is reported and summarised in +`evaluations/results/latest.yaml`. Use that preset as a starting point for new +experiments or promote it to your day-to-day `config.yaml`. + ### Example questions - Summarize tasks tagged #home in October 2025. - Find notes referencing [[Team Topologies]] and list my pros/cons. @@ -48,4 +84,7 @@ make test ## Notes - Skips `assets/` by default. Enable OCR later if needed. - Uses Markdown-aware chunking; tags from `#tag` and `tags::` stored in metadata. -- For faster machines, try bigger models; for CPU-only, consider `llama3.2` or `qwen2.5:7b` and smaller chunks. +- The default configuration enables targeted synonym expansion to improve recall + for concept-driven queries (e.g. “sailing” → “sloop”, “schooner”). +- For faster machines, try bigger models; for CPU-only, consider `llama3.2` or + `qwen2.5:7b` and larger chunk sizes to reduce request volume. diff --git a/app_config.py b/app_config.py new file mode 100644 index 0000000..0b4003a --- /dev/null +++ b/app_config.py @@ -0,0 +1,169 @@ +"""Configuration loader and helpers for logseq-chat. + +This module centralises default settings and exposes a convenient helper for +loading ``config.yaml`` (or preset overrides) as a nested namespace. Keeping +all configuration semantics in one place makes it easier to experiment with the +RAG pipeline while ensuring scripts like ``ingest.py`` and ``chat.py`` stay in +sync. 
# Built-in defaults. ``load_app_config`` deep-copies this mapping and merges the
# user's ``config.yaml`` (and any overrides) on top of it, so every key listed
# here is guaranteed to exist in the loaded configuration.
DEFAULT_CONFIG: Dict[str, Any] = {
    "logseq_root": "",
    "include_dirs": ["journals", "pages"],
    "exclude_globs": ["**/.git/**", "**/.DS_Store", "**/assets/**"],
    "file_exts": [".md"],
    "runtime": {
        "request_timeout": 180,
    },
    "chunk": {
        "chunk_size": 900,
        "chunk_overlap": 120,
    },
    "retrieval": {
        "top_k": 6,
        "mmr": {"enabled": True},
        "query_expansion": {
            "enabled": True,
            "max_expansions": 6,
            "synonyms": {
                "sailing": ["sloop", "schooner", "boat"],
                "boat": ["vessel", "ship"],
            },
        },
    },
    "models": {
        "llm": {
            "name": "llama3.1",
            "temperature": 0.1,
        },
        "embedding": {
            "name": "nomic-embed-text",
        },
    },
    "storage": {
        "chroma_path": ".rag/chroma",
        "collection_name": "logseq_rag",
        "clear_before_ingest": False,
    },
    "evaluation": {
        "dataset": "evaluations/datasets/baseline.yaml",
        "configurations_file": "evaluations/configurations.yaml",
        "max_queries": None,
        "scoring": {
            "accuracy_weight": 0.35,
            "coverage_weight": 0.2,
            "relevance_weight": 0.2,
            "hallucination_weight": 0.15,
            "speed_weight": 0.1,
        },
    },
}


def _deep_merge(base: MutableMapping[str, Any], updates: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
    """Recursively merge ``updates`` into ``base`` and return ``base``.

    Dictionaries are merged key-by-key; every other value (lists included) is
    replaced wholesale so the intent of an override stays explicit. ``base``
    is mutated in place, and non-dict values from ``updates`` are stored by
    reference, so callers should pass copies when either side must be
    preserved.
    """

    for key, value in updates.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            _deep_merge(base[key], value)  # type: ignore[index]
        else:
            base[key] = value
    return base


class ConfigNamespace(dict):
    """Dict subclass that also exposes its keys as attributes.

    Nested dictionaries are wrapped lazily on first attribute access and the
    wrapper is stored back into the parent, so repeated access returns the
    same object and writes such as ``cfg.storage.chroma_path = "x"`` persist.

    Attribute access only falls back to the mapping when normal lookup fails,
    so keys that collide with ``dict`` methods (``items``, ``keys``, ``get``,
    ...) must be read with ``cfg["items"]``.
    """

    def __getattr__(self, item: str) -> Any:
        try:
            value = self[item]
        except KeyError as exc:
            raise AttributeError(item) from exc
        wrapped = _wrap(value)
        if wrapped is not value:
            # Cache the wrapper; otherwise every access would hand out a fresh
            # copy and attribute writes on nested sections would be lost.
            self[item] = wrapped
        return wrapped

    __setattr__ = dict.__setitem__  # type: ignore
    __delattr__ = dict.__delitem__  # type: ignore

    def to_dict(self) -> Dict[str, Any]:
        """Return a deep copy of the namespace built from plain ``dict``s."""

        return _unwrap(self)


def _wrap(value: Any) -> Any:
    """Return ``value`` with plain dicts converted to ``ConfigNamespace``.

    Values that need no conversion are returned unchanged (same object) so
    callers can detect whether wrapping happened.
    """

    if isinstance(value, ConfigNamespace):
        return value
    if isinstance(value, dict):
        return ConfigNamespace(value)
    if isinstance(value, list):
        wrapped = [_wrap(item) for item in value]
        if all(new is old for new, old in zip(wrapped, value)):
            return value
        return wrapped
    return value


def _unwrap(value: Any) -> Any:
    """Return a deep copy of ``value`` with namespaces turned into dicts."""

    if isinstance(value, dict):  # also covers ConfigNamespace
        return {k: _unwrap(v) for k, v in value.items()}
    if isinstance(value, list):
        return [_unwrap(item) for item in value]
    return copy.deepcopy(value)


def _load_yaml(path: Path) -> Dict[str, Any]:
    """Read ``path`` as YAML and return its top-level mapping.

    An empty file yields ``{}``.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if the document's top level is not a mapping.
    """

    if not path.exists():
        raise FileNotFoundError(
            f"Configuration file not found: {path}\n"
            "Create one by copying config.yaml.sample and adjusting the values."
        )
    with path.open("r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    if not isinstance(data, dict):
        raise ValueError(f"Expected mapping at top of {path}, got {type(data)!r}")
    return data


def load_app_config(
    path: Optional[Path] = None,
    *,
    overrides: Optional[Iterable[MutableMapping[str, Any]]] = None,
) -> ConfigNamespace:
    """Load the application configuration and apply optional overrides.

    Layers are merged in this order, later ones winning: ``DEFAULT_CONFIG``,
    the YAML file at ``path`` (``config.yaml`` in the working directory when
    omitted), then each mapping in ``overrides``. None of the inputs are
    mutated.
    """

    config_path = path or Path("config.yaml")
    base = copy.deepcopy(DEFAULT_CONFIG)
    file_values = _load_yaml(config_path)
    _deep_merge(base, file_values)

    if overrides:
        for override in overrides:
            _deep_merge(base, copy.deepcopy(override))

    return ConfigNamespace(base)


def apply_overrides(base: ConfigNamespace, *overrides: MutableMapping[str, Any]) -> ConfigNamespace:
    """Return a new ``ConfigNamespace`` with overrides applied to ``base``.

    ``base`` and the override mappings are left untouched; the result is
    built from deep copies.
    """

    merged = base.to_dict()
    for override in overrides:
        _deep_merge(merged, copy.deepcopy(override))
    return ConfigNamespace(merged)


__all__ = ["ConfigNamespace", "DEFAULT_CONFIG", "apply_overrides", "load_app_config"]
The steps here mirror the high-level components of a RAG system: choose an @@ -32,16 +31,15 @@ def build_query_engine(): # Models (local via Ollama) Settings.llm = Ollama( - model=CONFIG["models"]["llm"], - request_timeout=180, - ) - Settings.embed_model = OllamaEmbedding( - model_name=CONFIG["models"]["embedding"], + model=config.models.llm.name, + request_timeout=config.runtime.request_timeout, + temperature=config.models.llm.temperature, ) + Settings.embed_model = OllamaEmbedding(model_name=config.models.embedding.name) # Vector store - client = chromadb.PersistentClient(path=CONFIG["storage"]["chroma_path"]) - collection = client.get_or_create_collection("logseq_rag") + client = chromadb.PersistentClient(path=config.storage.chroma_path) + collection = client.get_or_create_collection(config.storage.collection_name) vector_store = ChromaVectorStore(chroma_collection=collection) # Index from existing Chroma collection @@ -49,8 +47,8 @@ def build_query_engine(): # Let LlamaIndex create the retriever internally; pass our knobs only query_engine = index.as_query_engine( - similarity_top_k=CONFIG["retrieval"]["top_k"], - use_mmr=CONFIG["retrieval"]["mmr"], + similarity_top_k=config.retrieval.top_k, + use_mmr=getattr(config.retrieval.mmr, "enabled", False), ) return query_engine @@ -61,8 +59,10 @@ def main(): pages. Use ``:q`` to exit when you are done experimenting. """ + config = load_app_config() + print("Loading query engine...") - qe = build_query_engine() + qe = build_query_engine(config) print("Ready. 
Type your question (or :q to quit).") while True: try: @@ -73,7 +73,11 @@ def main(): if q == ":q": break - resp = qe.query(q) + expanded = expand_query(q, config) + if expanded.changed: + print(f"\n(expanded query with synonyms: {', '.join(expanded.added_terms)})") + + resp = qe.query(expanded.expanded) print("\n--- Answer ---") print(resp.response) diff --git a/config.yaml.sample b/config.yaml.sample index a5f22fb..a9ce180 100644 --- a/config.yaml.sample +++ b/config.yaml.sample @@ -1,33 +1,59 @@ -# Update this to the absolute path of your Logseq graph directory -# (the folder that contains assets/, journals/, pages/) -logseq_root: "/Users/jdoe/logseq" +# Copy this file to config.yaml and customise the values for your graph. +# The inline comments suggest proven alternatives so you can experiment quickly. + +# Absolute path to the Logseq graph folder (the directory that contains +# journals/ and pages/). +logseq_root: "/absolute/path/to/your/logseq" # Example: "/Users/alex/notes" include_dirs: - - "journals" - - "pages" + - "journals" # Add "journals/archive" to cover older notes + - "pages" # Replace with "pages/projects" to scope ingestion exclude_globs: - "**/.git/**" - "**/.DS_Store" - - "**/assets/**" # Skip heavy assets by default + - "**/assets/**" # Swap to "**/*.{png,jpg}" to skip only large images + +file_exts: [".md"] # Add ".org" if you keep Org-mode files alongside Markdown -file_exts: [".md"] +runtime: + request_timeout: 180 # Raise to 300 if Ollama frequently times out on long answers chunk: - by_headers: true - chunk_size: 1200 - chunk_overlap: 200 + chunk_size: 900 # Try 650 for higher recall or 1200 for faster ingestion + chunk_overlap: 120 # Reduce to 80 when chunks are larger to save memory retrieval: - top_k: 6 - mmr: true + top_k: 6 # Increase to 10 when answers miss supporting context + mmr: + enabled: true # Set to false for deterministic ranking when testing + query_expansion: + enabled: true # Disable to rely solely on literal matches + 
max_expansions: 6 # Raise to 10 if you define longer synonym lists + synonyms: + sailing: ["sloop", "schooner", "boat"] # Add "catamaran" for racing notes + boat: ["vessel", "ship"] # Extend with "kayak" for paddling logs -# On smaller machines use all-minilm, it's relatively lightweight models: - llm: "llama3.1" - embedding: "all-minilm" + llm: + name: "llama3.1" # Alternatives: "mistral" for speed, "mixtral" for depth + temperature: 0.1 # Raise towards 0.3 for more conversational answers + embedding: + name: "nomic-embed-text" # Swap to "all-minilm" when GPU memory is limited storage: - chroma_path: ".rag/chroma" - index_store: ".rag/index_store" - docstore: ".rag/docstore" + chroma_path: ".rag/chroma" # Use ".rag/chroma_fast" for experiments + collection_name: "logseq_rag" # Give each preset a unique name when testing + clear_before_ingest: false # Set true to rebuild from scratch every run + +# Evaluation defaults used by evaluation/runner.py. +evaluation: + dataset: "evaluations/datasets/baseline.yaml" + configurations_file: "evaluations/configurations.yaml" + max_queries: null # Set to a small integer for smoke tests during tuning + scoring: + accuracy_weight: 0.35 # Raise to 0.5 to emphasise exact matches + coverage_weight: 0.2 # Lower when source recall is less critical + relevance_weight: 0.2 # Balance between required and optional keywords + hallucination_weight: 0.15 # Increase to punish unsupported citations + speed_weight: 0.1 # Boost when latency matters most diff --git a/config/presets/default.yaml b/config/presets/default.yaml new file mode 100644 index 0000000..894542a --- /dev/null +++ b/config/presets/default.yaml @@ -0,0 +1,47 @@ +# Default configuration evaluated by the harness. These values mirror the +# settings shipped in config.yaml.sample so users can easily promote this preset +# to their everyday configuration. 
+logseq_root: "/absolute/path/to/your/logseq" +include_dirs: + - "journals" + - "pages" +exclude_globs: + - "**/.git/**" + - "**/.DS_Store" + - "**/assets/**" +file_exts: [".md"] +runtime: + request_timeout: 180 +chunk: + chunk_size: 900 + chunk_overlap: 120 +retrieval: + top_k: 6 + mmr: + enabled: true + query_expansion: + enabled: true + max_expansions: 6 + synonyms: + sailing: ["sloop", "schooner", "boat"] + boat: ["vessel", "ship"] +models: + llm: + name: "llama3.1" + temperature: 0.1 + embedding: + name: "nomic-embed-text" +storage: + chroma_path: ".rag/chroma" + collection_name: "logseq_rag" + clear_before_ingest: false +evaluation: + dataset: "evaluations/datasets/baseline.yaml" + configurations_file: "evaluations/configurations.yaml" + max_queries: null + scoring: + accuracy_weight: 0.35 + coverage_weight: 0.2 + relevance_weight: 0.2 + hallucination_weight: 0.15 + speed_weight: 0.1 diff --git a/config/presets/high_recall.yaml b/config/presets/high_recall.yaml new file mode 100644 index 0000000..fd9958a --- /dev/null +++ b/config/presets/high_recall.yaml @@ -0,0 +1,16 @@ +# Prioritise coverage: smaller chunks, heavier overlap, deeper retrieval. +chunk: + chunk_size: 650 + chunk_overlap: 160 +retrieval: + top_k: 10 + mmr: + enabled: true + query_expansion: + enabled: true + max_expansions: 10 +models: + llm: + temperature: 0.05 +storage: + collection_name: "logseq_rag_high_recall" diff --git a/config/presets/lightweight.yaml b/config/presets/lightweight.yaml new file mode 100644 index 0000000..6bac9b6 --- /dev/null +++ b/config/presets/lightweight.yaml @@ -0,0 +1,15 @@ +# Latency-focused profile for quick iteration on modest hardware. 
+chunk: + chunk_size: 1200 + chunk_overlap: 80 +retrieval: + top_k: 4 + mmr: + enabled: false + query_expansion: + enabled: false +models: + llm: + temperature: 0.0 +storage: + collection_name: "logseq_rag_fast" diff --git a/evaluation/runner.py b/evaluation/runner.py new file mode 100644 index 0000000..753b2ad --- /dev/null +++ b/evaluation/runner.py @@ -0,0 +1,247 @@ +"""Quantitative evaluation harness for logseq-chat configurations. + +Running ``python evaluation/runner.py`` (or ``make evaluate``) executes the +following steps: + +1. Load the user's base configuration from ``config.yaml``. +2. Expand it with the presets listed in ``evaluations/configurations.yaml``. +3. Re-ingest the Logseq graph for each preset into an isolated storage + directory. +4. Query the index with prompts from ``evaluations/datasets/baseline.yaml``. +5. Compute accuracy, coverage, hallucination, relevance, and latency metrics. +6. Report a ranked leaderboard and identify the best configuration. + +Scoring is plain keyword and source matching, and every model call goes through +Ollama, so users can iterate rapidly on new presets or datasets. Replace the sample +queries with your own gold-standard answers to turn this into a bespoke tuning +loop for your notes. 
def _load_yaml(path: Path) -> Dict[str, Any]:
    """Read ``path`` as YAML and return its top-level mapping.

    An empty document yields ``{}``.

    Raises:
        ValueError: if the document's top level is not a mapping.
    """

    with path.open("r", encoding="utf-8") as fh:
        data = yaml.safe_load(fh) or {}
    if not isinstance(data, dict):
        raise ValueError(f"Expected mapping at top of {path}, got {type(data)!r}")
    return data


def _ensure_profiled_storage(config: ConfigNamespace, profile: str) -> ConfigNamespace:
    """Return a copy of ``config`` whose storage is isolated for ``profile``.

    The copy points Chroma at ``.rag/evaluations/<profile>/chroma``, suffixes
    the collection name with the profile and forces ``clear_before_ingest`` so
    each evaluation run starts from an empty collection. ``config`` itself is
    not modified.
    """

    updated = config.to_dict()
    storage = updated["storage"]
    profile_root = Path(".rag") / "evaluations" / profile
    storage["chroma_path"] = str(profile_root / "chroma")
    storage["collection_name"] = f"{storage.get('collection_name', 'logseq_rag')}_{profile}"
    storage["clear_before_ingest"] = True
    return ConfigNamespace(updated)


def _extract_sources(response: Any) -> List[str]:
    """Collect the source identifiers attached to a query response.

    Reads ``response.source_nodes`` and, for each entry, the ``metadata`` of
    its ``node`` attribute (or of the entry itself when it has no ``node``).
    The ``source`` key is preferred, falling back to ``file_path``; entries
    with neither are skipped. One value is returned per node, so duplicates
    are possible.
    """

    sources: List[str] = []
    for node in getattr(response, "source_nodes", []) or []:
        metadata = getattr(getattr(node, "node", node), "metadata", None) or {}
        source = metadata.get("source") or metadata.get("file_path")
        if source:
            sources.append(str(source))
    return sources


def _lower_list(values: List[str]) -> List[str]:
    """Return ``values`` lower-cased, preserving order and duplicates."""

    return [v.lower() for v in values]


def _score_query(response_text: str, sources: List[str], spec: Dict[str, Any]) -> Dict[str, float]:
    """Score one answer against its dataset ``spec``.

    All comparisons are case-insensitive. Keywords are matched as substrings
    of ``response_text``; sources are matched by exact string equality.

    Returns a dict with:
        accuracy: fraction of required keywords present (1.0 if none listed).
        relevance: fraction of required + optional keywords present
            (1.0 if none listed).
        coverage: fraction of distinct expected sources that were retrieved
            (1.0 if none listed); always within [0, 1].
        hallucination_rate: fraction of retrieved sources that are not
            expected; 0.0 when ``allow_additional_sources`` is true or
            nothing was retrieved.
    """

    text = response_text.lower()
    keywords = spec.get("answer_keywords", {}) or {}
    required = _lower_list(keywords.get("required", []) or [])
    optional = _lower_list(keywords.get("optional", []) or [])

    required_hits = sum(1 for kw in required if kw in text)
    optional_hits = sum(1 for kw in optional if kw in text)

    accuracy = required_hits / len(required) if required else 1.0
    keyword_total = len(required) + len(optional)
    relevance = (required_hits + optional_hits) / keyword_total if keyword_total else 1.0

    # NOTE(review): matching is exact, so dataset entries must use the same
    # path form that ingestion stores in node metadata -- confirm.
    expected = set(_lower_list(spec.get("expected_sources", []) or []))
    retrieved = _lower_list(sources)

    # Count each expected source once: several retrieved chunks usually come
    # from the same file, and counting them all would push coverage above 1.
    coverage = len(expected.intersection(retrieved)) / len(expected) if expected else 1.0

    allow_extra = bool(spec.get("allow_additional_sources", False))
    unexpected = 0 if allow_extra else sum(1 for src in retrieved if src not in expected)
    hallucination_rate = unexpected / len(retrieved) if retrieved else 0.0

    return {
        "accuracy": accuracy,
        "relevance": relevance,
        "coverage": coverage,
        "hallucination_rate": hallucination_rate,
    }


def evaluate_configuration(
    name: str,
    base_config: ConfigNamespace,
    overrides: Dict[str, Any],
    dataset: Dict[str, Any],
    *,
    max_queries: int | None,
) -> Tuple[ConfigNamespace, Dict[str, Any]]:
    """Ingest and query with one preset, returning its config and metrics.

    ``overrides`` is merged over ``base_config``, storage is redirected to a
    profile-specific location named after ``name``, the graph is re-ingested
    and every query in ``dataset["queries"]`` is run (after query expansion)
    and scored.

    Args:
        max_queries: upper bound on the number of queries; ``None`` or ``0``
            means "run them all".

    Returns:
        ``(profiled_config, {"metrics": aggregate, "raw": per_query})`` where
        ``aggregate`` holds the mean of each metric plus ``avg_latency_ms``.
        When no query ran, the aggregate is the worst case (zeros, a
        hallucination rate of 1.0 and infinite latency).
    """

    config = apply_overrides(base_config, overrides)
    profiled = _ensure_profiled_storage(config, name)

    run_ingest(profiled, verbose=False)
    query_engine = build_query_engine(profiled)

    queries = dataset.get("queries", []) or []
    limit = max_queries or len(queries)
    results: List[Dict[str, float]] = []

    for spec in queries[:limit]:
        expanded = expand_query(spec["question"], profiled)
        # Only the engine call is timed; expansion and scoring are excluded.
        start = time.perf_counter()
        response = query_engine.query(expanded.expanded)
        latency_ms = (time.perf_counter() - start) * 1000

        # ``response`` may be missing or None; score it as an empty answer.
        text = getattr(response, "response", "") or ""
        metrics = _score_query(text, _extract_sources(response), spec)
        metrics["latency_ms"] = latency_ms
        results.append(metrics)

    if results:
        aggregate: Dict[str, float] = {
            key: statistics.mean(r[key] for r in results)
            for key in ("accuracy", "relevance", "coverage", "hallucination_rate")
        }
        aggregate["avg_latency_ms"] = statistics.mean(r["latency_ms"] for r in results)
    else:
        aggregate = {
            "accuracy": 0.0,
            "relevance": 0.0,
            "coverage": 0.0,
            "hallucination_rate": 1.0,
            "avg_latency_ms": float("inf"),
        }

    return profiled, {"metrics": aggregate, "raw": results}


def _composite_score(metrics: Dict[str, float], weights: Dict[str, float], speed_anchor: float) -> float:
    """Combine ``metrics`` into a single weighted score.

    The speed component is ``speed_anchor / avg_latency_ms`` capped at 1.0,
    so the configuration whose latency equals the anchor earns the full speed
    weight. It is 0.0 whenever either value is non-positive, infinite or NaN;
    dividing two infinities would otherwise turn the whole score into NaN.
    Missing weights and metrics count as 0.0.
    """

    inf = float("inf")
    latency = metrics.get("avg_latency_ms", inf)
    speed_score = 0.0
    if 0 < speed_anchor < inf and 0 < latency < inf:
        speed_score = min(speed_anchor / latency, 1.0)

    return (
        weights.get("accuracy_weight", 0.0) * metrics.get("accuracy", 0.0)
        + weights.get("coverage_weight", 0.0) * metrics.get("coverage", 0.0)
        + weights.get("relevance_weight", 0.0) * metrics.get("relevance", 0.0)
        + weights.get("hallucination_weight", 0.0) * (1 - metrics.get("hallucination_rate", 0.0))
        + weights.get("speed_weight", 0.0) * speed_score
    )


def main() -> None:
    """Evaluate every listed preset, print a leaderboard and save the winner.

    Command-line flags override the ``evaluation`` section of ``config.yaml``.
    The summary of the best-scoring configuration is written to
    ``evaluations/results/latest.yaml``.
    """

    parser = argparse.ArgumentParser(description="Evaluate configuration presets against a labelled dataset.")
    parser.add_argument("--dataset", type=Path, default=None, help="Override dataset path")
    parser.add_argument("--configurations", type=Path, default=None, help="Override configurations list")
    parser.add_argument("--max-queries", type=int, default=None, help="Limit the number of queries executed per configuration")
    args = parser.parse_args()

    base_config = load_app_config()
    evaluation_cfg = base_config.evaluation

    dataset_path = args.dataset or Path(evaluation_cfg.dataset)
    config_list_path = args.configurations or Path(evaluation_cfg.configurations_file)
    max_queries = args.max_queries if args.max_queries is not None else evaluation_cfg.max_queries

    dataset = _load_yaml(dataset_path)
    config_specs = _load_yaml(config_list_path).get("configurations", [])

    if not config_specs:
        raise SystemExit(f"No configurations listed in {config_list_path}")

    weights = evaluation_cfg.scoring.to_dict() if hasattr(evaluation_cfg.scoring, "to_dict") else dict(evaluation_cfg.scoring)

    measured: List[Tuple[str, Dict[str, Any]]] = []
    for entry in config_specs:
        name = entry.get("name")
        if not name:
            raise ValueError(f"Configuration entry missing name: {entry}")
        preset_path = entry.get("preset")
        overrides: Dict[str, Any] = _load_yaml(Path(preset_path)) if preset_path else {}
        _, result = evaluate_configuration(
            name,
            base_config,
            overrides,
            dataset,
            max_queries=max_queries,
        )
        measured.append((name, result["metrics"]))

    # The fastest configuration anchors the speed score; scoring therefore has
    # to wait until every configuration has been measured.
    speed_anchor = min((metrics.get("avg_latency_ms", float("inf")) for _, metrics in measured), default=0.0)

    # ``sorted`` is stable, so ties keep the order of the configurations file
    # and the first entry is the earliest configuration with the top score.
    ranked = sorted(
        ((name, _composite_score(metrics, weights, speed_anchor), metrics) for name, metrics in measured),
        key=lambda item: item[1],
        reverse=True,
    )

    print("\nConfiguration leaderboard:")
    print("-------------------------")
    for name, score, metrics in ranked:
        print(
            f"{name:>15} score={score:0.3f} accuracy={metrics['accuracy']:0.3f} "
            f"coverage={metrics['coverage']:0.3f} relevance={metrics['relevance']:0.3f} "
            f"hallucinations={metrics['hallucination_rate']:0.3f} avg_latency_ms={metrics['avg_latency_ms']:0.1f}"
        )

    best_name, best_score, best_metrics = ranked[0]
    print(
        "\nBest configuration:",
        best_name,
        f"(score={best_score:0.3f})",
    )
    output_path = Path("evaluations/results")
    output_path.mkdir(parents=True, exist_ok=True)
    with (output_path / "latest.yaml").open("w", encoding="utf-8") as fh:
        yaml.safe_dump(
            {
                "best_configuration": best_name,
                "score": best_score,
                "metrics": dict(best_metrics),
            },
            fh,
            sort_keys=False,
        )
    print(f"Saved summary to {output_path / 'latest.yaml'}")


if __name__ == "__main__":
    main()
+ answer_keywords: + required: + - "sloop" + optional: + - "sailing" + - "harbor" + - "schooner" + expected_sources: + - "journals/2024-06-01.md" + - "pages/sailing.md" + allow_additional_sources: true + notes: "Focus on nautical synonyms so boats like sloops and schooners are surfaced." + + - id: roadmap_meeting + question: "Summarize my meeting with Alex about the roadmap." + answer_keywords: + required: + - "roadmap" + optional: + - "Alex" + - "milestones" + - "timeline" + expected_sources: + - "pages/roadmap_meeting.md" + - "journals/2024-04-18.md" + allow_additional_sources: false + notes: "Use this to watch for hallucinations that mention participants who were not present." + + - id: finances + question: "How are my quarterly finances looking?" + answer_keywords: + required: + - "quarter" + - "revenue" + optional: + - "expenses" + - "profit" + expected_sources: + - "pages/finances/q2.md" + allow_additional_sources: false + notes: "Ensures numerical summaries are grounded in the notes." diff --git a/ingest.py b/ingest.py index 987b831..8322261 100644 --- a/ingest.py +++ b/ingest.py @@ -20,15 +20,13 @@ from typing import List import chromadb -import yaml from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex from llama_index.core.node_parser import SimpleNodeParser from llama_index.embeddings.ollama import OllamaEmbedding from llama_index.llms.ollama import Ollama from llama_index.vector_stores.chroma import ChromaVectorStore -with open("config.yaml", "r", encoding="utf-8") as f: - CONFIG = yaml.safe_load(f) +from app_config import ConfigNamespace, load_app_config PAGE_LINK = re.compile(r"\[\[([^\]]+)\]\]") # [[Page]] BLOCK_REF = re.compile(r"\(\(([a-zA-Z0-9_-]{6,})\)\)") # ((block-id)) @@ -120,48 +118,70 @@ def load_documents(paths: List[str]) -> List[Document]: docs.append(Document(text=clean, metadata=meta)) return docs -def main(): - """Run the full ingestion workflow using settings from ``config.yaml``. 
def run_ingest(config: ConfigNamespace, *, verbose: bool = True) -> None:
    """Build the vector index for the Logseq graph described by ``config``.

    Files are collected and loaded, split into nodes using the configured
    chunk size and overlap, and written to the configured Chroma collection.
    When ``config.storage.clear_before_ingest`` is truthy the collection is
    dropped first. Progress lines are printed only when ``verbose`` is true.

    Raises:
        SystemExit: if ``config.logseq_root`` is empty or not a directory.
    """

    def announce(message: str) -> None:
        if verbose:
            print(message)

    graph_root = config.logseq_root
    wanted_dirs = list(config.include_dirs)
    wanted_exts = list(config.file_exts)
    skip_globs = list(config.exclude_globs)

    root_is_usable = bool(graph_root) and os.path.isdir(graph_root)
    if not root_is_usable:
        raise SystemExit(
            f"Logseq root does not exist: {graph_root}\n"
            "Edit config.yaml to set logseq_root before running ingest."
        )

    paths = collect_files(graph_root, wanted_dirs, wanted_exts, skip_globs)
    announce(f"Found {len(paths)} markdown files.")

    docs = load_documents(paths)
    announce(f"Loaded {len(docs)} documents.")

    llm_cfg = config.models.llm
    Settings.llm = Ollama(
        model=llm_cfg.name,
        request_timeout=config.runtime.request_timeout,
        temperature=llm_cfg.temperature,
    )
    Settings.embed_model = OllamaEmbedding(model_name=config.models.embedding.name)

    splitter = SimpleNodeParser.from_defaults(
        include_metadata=True,
        chunk_size=config.chunk.chunk_size,
        chunk_overlap=config.chunk.chunk_overlap,
    )
    nodes = splitter.get_nodes_from_documents(docs)
    announce(f"Parsed into {len(nodes)} nodes.")

    store_dir = config.storage.chroma_path
    os.makedirs(store_dir, exist_ok=True)
    client = chromadb.PersistentClient(path=store_dir)

    target = config.storage.collection_name
    if getattr(config.storage, "clear_before_ingest", False):
        # Best effort: any failure to delete (for example, the collection not
        # existing yet) is ignored and ingestion carries on.
        try:
            client.delete_collection(target)
        except Exception:
            pass

    collection = client.get_or_create_collection(target)
    storage_ctx = StorageContext.from_defaults(
        vector_store=ChromaVectorStore(chroma_collection=collection)
    )
    # Constructed for its side effect of writing the nodes into the store.
    _ = VectorStoreIndex(nodes, storage_context=storage_ctx)
    announce("Index built and persisted to Chroma.")


def main() -> None:
    """Ingest the graph using the settings loaded from ``config.yaml``."""

    run_ingest(load_app_config())

if __name__ == "__main__":
    main()
import re
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:  # needed for annotations only, never at runtime
    from app_config import ConfigNamespace


# A token is a run of word characters, apostrophes, or hyphens.
_WORD = re.compile(r"[\w'-]+")


@dataclass
class ExpandedQuery:
    """Result of :func:`expand_query`.

    ``original`` is the question as asked, ``expanded`` is the text to send to
    retrieval, and ``added_terms`` lists the terms appended (empty when the
    question was left untouched).
    """

    original: str
    expanded: str
    added_terms: List[str]

    @property
    def changed(self) -> bool:
        """Return ``True`` when at least one term was appended."""
        return bool(self.added_terms)


def _normalise_terms(values) -> List[str]:
    """Coerce one synonym entry into a list of lower-cased, non-empty terms.

    A bare scalar (``sailing: sloop``) is treated as a single term rather than
    being iterated character by character, and ``None`` yields no terms.
    """
    if values is None:
        return []
    if isinstance(values, (str, bytes)) or not isinstance(values, Iterable):
        values = [values]
    terms = (str(term).lower() for term in values if term is not None)
    return [term for term in terms if term]


def expand_query(question: str, config: "ConfigNamespace") -> ExpandedQuery:
    """Expand ``question`` with synonym hints defined in ``config``.

    The settings are read from ``config.retrieval.query_expansion``:
    ``enabled`` switches the feature on, ``synonyms`` maps a root word to its
    related terms, and ``max_expansions`` (a non-negative int) caps how many
    terms are appended.  Matching is case-insensitive and works on single
    tokens: a token equal to a root adds that root's related terms, while any
    other token adds every root whose related terms contain it.

    Args:
        question: The user's natural-language question.
        config: Application configuration object.

    Returns:
        An :class:`ExpandedQuery`.  When expansion is disabled, misconfigured,
        or produces no new terms, ``expanded`` equals ``question`` and
        ``added_terms`` is empty.
    """
    unchanged = ExpandedQuery(question, question, [])

    retrieval_cfg = getattr(config, "retrieval", None)
    expansion_cfg = getattr(retrieval_cfg, "query_expansion", None)
    if not getattr(expansion_cfg, "enabled", False):
        return unchanged

    # NOTE(review): if the config loader wraps nested mappings in its own
    # namespace type, ``synonyms`` will not be a Mapping and expansion is
    # skipped -- confirm against app_config.
    synonyms = getattr(expansion_cfg, "synonyms", None)
    if not isinstance(synonyms, Mapping) or not synonyms:
        return unchanged

    forward = {
        str(root).lower(): _normalise_terms(related)
        for root, related in synonyms.items()
    }

    # Reverse index (related term -> roots, in config order), built once so
    # each token is a dictionary lookup instead of a scan of every list.
    reverse = {}
    for root, related in forward.items():
        for term in related:
            roots = reverse.setdefault(term, [])
            if root not in roots:
                roots.append(root)

    tokens = [token.lower() for token in _WORD.findall(question)]

    candidates: List[str] = []
    for token in tokens:
        if token in forward:
            candidates.extend(forward[token])
        else:
            candidates.extend(reverse.get(token, ()))

    # Drop words already present in the question and repeated candidates,
    # keeping first-seen order.
    seen = set(tokens)
    added: List[str] = []
    for term in candidates:
        if term not in seen:
            seen.add(term)
            added.append(term)

    max_terms = getattr(expansion_cfg, "max_expansions", None)
    if isinstance(max_terms, int) and max_terms >= 0:
        added = added[:max_terms]

    if not added:
        return unchanged

    return ExpandedQuery(question, f"{question} " + " ".join(added), added)


__all__ = ["ExpandedQuery", "expand_query"]
types +from pathlib import Path +import textwrap + import pytest +def _install_dependency_stubs(): + if "chromadb" not in sys.modules: + chromadb = types.ModuleType("chromadb") + + class _Collection: + pass + + class _Client: + def __init__(self, *_, **__): + self._collection = _Collection() + + def get_or_create_collection(self, *_args, **_kwargs): + return self._collection + + def delete_collection(self, *_args, **_kwargs): + return None + + chromadb.PersistentClient = _Client + sys.modules["chromadb"] = chromadb + + if "llama_index" not in sys.modules: + root = types.ModuleType("llama_index") + sys.modules["llama_index"] = root + + if "llama_index.core" not in sys.modules: + core = types.ModuleType("llama_index.core") + + class _DummyDocument: + def __init__(self, text: str, metadata: dict): + self.text = text + self.metadata = metadata + + class _DummySettings: + llm = None + embed_model = None + + class _DummyStorageContext: + @classmethod + def from_defaults(cls, **_kwargs): + return cls() + + class _DummyVectorStoreIndex: + def __init__(self, *_, **__): + pass + + core.Document = _DummyDocument + core.Settings = _DummySettings + core.StorageContext = _DummyStorageContext + core.VectorStoreIndex = _DummyVectorStoreIndex + sys.modules["llama_index.core"] = core + + if "llama_index.core.node_parser" not in sys.modules: + node_parser = types.ModuleType("llama_index.core.node_parser") + + class _Parser: + @classmethod + def from_defaults(cls, **_kwargs): + return cls() + + def get_nodes_from_documents(self, documents): + return documents + + node_parser.SimpleNodeParser = _Parser + sys.modules["llama_index.core.node_parser"] = node_parser + + if "llama_index.embeddings.ollama" not in sys.modules: + embeddings = types.ModuleType("llama_index.embeddings.ollama") + + class _DummyEmbedding: + def __init__(self, *_, **__): + pass + + embeddings.OllamaEmbedding = _DummyEmbedding + sys.modules["llama_index.embeddings.ollama"] = embeddings + + if 
"llama_index.llms.ollama" not in sys.modules: + llms = types.ModuleType("llama_index.llms.ollama") + + class _DummyLLM: + def __init__(self, *_, **__): + pass + + llms.Ollama = _DummyLLM + sys.modules["llama_index.llms.ollama"] = llms + + if "llama_index.vector_stores.chroma" not in sys.modules: + vector_store = types.ModuleType("llama_index.vector_stores.chroma") + + class _DummyVectorStore: + def __init__(self, *_, **__): + pass + + vector_store.ChromaVectorStore = _DummyVectorStore + sys.modules["llama_index.vector_stores.chroma"] = vector_store + + if "yaml" not in sys.modules: + yaml_stub = types.ModuleType("yaml") + + def _safe_load(data): + return {} + + def _safe_dump(_data, _fh, **_kwargs): + return None + + yaml_stub.safe_load = _safe_load + yaml_stub.safe_dump = _safe_dump + sys.modules["yaml"] = yaml_stub + + @pytest.fixture(scope="session") def ingest_module(): project_root = Path(__file__).resolve().parents[1] config_path = project_root / "config.yaml" created = False + _install_dependency_stubs() + if not config_path.exists(): config_path.write_text( textwrap.dedent( @@ -20,17 +137,37 @@ def ingest_module(): include_dirs: [] file_exts: [] exclude_globs: [] + runtime: + request_timeout: 30 models: - llm: llama3.1 - embedding: nomic-embed-text + llm: + name: llama3.1 + temperature: 0.0 + embedding: + name: nomic-embed-text storage: chroma_path: /tmp/chroma + collection_name: test_collection + clear_before_ingest: true retrieval: top_k: 5 - mmr: false + mmr: + enabled: false + query_expansion: + enabled: false chunk: chunk_size: 512 chunk_overlap: 50 + evaluation: + dataset: evaluations/datasets/baseline.yaml + configurations_file: evaluations/configurations.yaml + max_queries: null + scoring: + accuracy_weight: 0.35 + coverage_weight: 0.2 + relevance_weight: 0.2 + hallucination_weight: 0.15 + speed_weight: 0.1 """ ).strip() )