wiseiodev · dubscode · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026
diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml
@@ -43,6 +43,9 @@ jobs:
       - name: Run checks
         run: pnpm checks
 
+      - name: Run retrieval proofing (smoke profile)
+        run: pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing
+
       - name: Comment success summary on PR
         if: ${{ success() && github.event_name == 'pull_request' }}
         continue-on-error: true

diff --git a/README.md b/README.md
@@ -33,6 +33,7 @@ pnpm dev -- chat
 pnpm dev -- chat "summarize this repo"
 pnpm dev -- plan "create a rollout plan for indexing"
 pnpm dev -- index .
+pnpm dev retrieval-proof --profile smoke
 pnpm dev -- automations list
 pnpm dev -- automations add --name "Hourly Check" --cron "0 * * * *" --prompt "summarize local status"
 pnpm dev -- automations run
@@ -94,3 +95,4 @@ Environment variables (BYOK):
 
 - Anthropic embeddings currently fall back to deterministic local vectors.
 - This project intentionally uses Biome only (no ESLint/Prettier).
+- Retrieval proofing docs: benchmark schema in `docs/retrieval-proofing-benchmark-schema.md`; workflow in `docs/retrieval-proofing.md`.
diff --git a/benchmarks/retrieval-proofing/benchmark.v1.json b/benchmarks/retrieval-proofing/benchmark.v1.json
@@ -0,0 +1,217 @@
+{
+  "version": "1.0",
+  "datasetName": "retrieval-proofing-core",
+  "datasetVersion": "2026.03.01",
+  "cases": [
+    {
+      "id": "repo-layout",
+      "title": "Find retrieval implementation location",
+      "query": "where is hybrid retrieval implemented",
+      "intent": "lookup",
+      "difficulty": "low",
+      "topK": 3,
+      "expectedEvidenceDocIds": ["d1", "d2"],
+      "documents": [
+        {
+          "id": "d1",
+          "path": "src/context/retrieval/hybrid.ts",
+          "title": "Hybrid retrieval source",
+          "content": "The hybrid retrieval runner combines lexical search, vector similarity, and ranking metadata."
+        },
+        {
+          "id": "d2",
+          "path": "src/context/retrieval/rerank.ts",
+          "title": "Rerank helpers",
+          "content": "hybridRerank computes weighted retrieval ordering and cosine similarity for vector retrieval."
+        },
+        {
+          "id": "d3",
+          "path": "README.md",
+          "title": "Project overview",
+          "content": "General project overview and quick start commands for local development."
+        },
+        {
+          "id": "d4",
+          "path": "src/cli/commands.ts",
+          "title": "CLI commands",
+          "content": "Registers chat, plan, index, and automations commands."
+        }
+      ]
+    },
+    {
+      "id": "quality-commands",
+      "title": "Identify quality gates",
+      "query": "what command runs lint typecheck and tests",
+      "intent": "lookup",
+      "difficulty": "medium",
+      "topK": 3,
+      "expectedEvidenceDocIds": ["d1"],
+      "documents": [
+        {
+          "id": "d1",
+          "path": "package.json",
+          "title": "Project scripts",
+          "content": "The checks script runs pnpm test, pnpm typecheck, pnpm lint, and pnpm build."
+        },
+        {
+          "id": "d2",
+          "path": "README.md",
+          "title": "README quality commands",
+          "content": "The README includes lint, typecheck, test, and build as quality commands."
+        },
+        {
+          "id": "d3",
+          "path": "src/automation/runner.ts",
+          "title": "Automation runner",
+          "content": "Executes configured prompts on cron schedules."
+        },
+        {
+          "id": "d4",
+          "path": "src/db/client.ts",
+          "title": "Database client",
+          "content": "Initializes PGLite and exposes query and exec methods."
+        }
+      ]
+    },
+    {
+      "id": "provider-preflight",
+      "title": "Provider preflight requirements",
+      "query": "which env var is required for google provider preflight",
+      "intent": "lookup",
+      "difficulty": "medium",
+      "topK": 3,
+      "expectedEvidenceDocIds": ["d1", "d2"],
+      "documents": [
+        {
+          "id": "d1",
+          "path": "src/cli/commands/chat.tsx",
+          "title": "Chat preflight",
+          "content": "Chat preflight checks provider env vars and prints setup instructions for google openai and anthropic."
+        },
+        {
+          "id": "d2",
+          "path": "README.md",
+          "title": "Environment variable docs",
+          "content": "GOOGLE_GENERATIVE_AI_API_KEY is required when using the google provider."
+        },
+        {
+          "id": "d3",
+          "path": "src/mcp/client.ts",
+          "title": "MCP client",
+          "content": "Starts and interacts with external MCP servers."
+        },
+        {
+          "id": "d4",
+          "path": "src/context/indexer/full-index.ts",
+          "title": "Indexer",
+          "content": "Indexes repository files and writes chunks and embeddings."
+        }
+      ]
+    },
+    {
+      "id": "policy-approval",
+      "title": "Approval behavior",
+      "query": "which actions require approval in interactive mode",
+      "intent": "reasoning",
+      "difficulty": "high",
+      "topK": 3,
+      "expectedEvidenceDocIds": ["d1", "d2"],
+      "documents": [
+        {
+          "id": "d1",
+          "path": "README.md",
+          "title": "Interactive approval docs",
+          "content": "Sensitive write and destructive tool actions require explicit approve, deny, or dismiss decisions in the TUI."
+        },
+        {
+          "id": "d2",
+          "path": "src/policy/engine.ts",
+          "title": "Policy engine",
+          "content": "Policy engine classifies tool side effects and enforces approval decisions."
+        },
+        {
+          "id": "d3",
+          "path": "src/context/retrieval/rerank.ts",
+          "title": "Rerank",
+          "content": "Reranks retrieval candidates with weighted score combination."
+        },
+        {
+          "id": "d4",
+          "path": "src/db/migrate.ts",
+          "title": "Migrations",
+          "content": "Applies schema migrations at startup."
+        }
+      ]
+    },
+    {
+      "id": "automation-hooks",
+      "title": "Hook trigger behavior",
+      "query": "what hooks trigger tests or typecheck",
+      "intent": "lookup",
+      "difficulty": "low",
+      "topK": 3,
+      "expectedEvidenceDocIds": ["d1"],
+      "documents": [
+        {
+          "id": "d1",
+          "path": "AGENTS.md",
+          "title": "Hook definitions",
+          "content": "file-change runs pnpm test and git-head-change runs pnpm typecheck."
+        },
+        {
+          "id": "d2",
+          "path": "README.md",
+          "title": "README automation",
+          "content": "Automations can run prompts but does not define hook command mapping."
+        },
+        {
+          "id": "d3",
+          "path": "src/automation/scheduler.ts",
+          "title": "Scheduler",
+          "content": "Cron scheduler dispatches queued automation specs."
+        },
+        {
+          "id": "d4",
+          "path": "src/agent/orchestrator.ts",
+          "title": "Orchestrator",
+          "content": "Runs gather reason act verify loops with validation retries."
+        }
+      ]
+    },
+    {
+      "id": "ci-workflow",
+      "title": "CI checks pipeline",
+      "query": "where is pr checks workflow defined",
+      "intent": "lookup",
+      "difficulty": "medium",
+      "topK": 3,
+      "expectedEvidenceDocIds": ["d1", "d2"],
+      "documents": [
+        {
+          "id": "d1",
+          "path": ".github/workflows/pr-checks.yml",
+          "title": "PR checks workflow",
+          "content": "Runs pnpm checks in CI on pull requests."
+        },
+        {
+          "id": "d2",
+          "path": "README.md",
+          "title": "Project quality commands",
+          "content": "The quality command list maps to CI checks execution."
+        },
+        {
+          "id": "d3",
+          "path": "src/tools/registry.ts",
+          "title": "Tool registry",
+          "content": "Defines registered local tools and validation."
+        },
+        {
+          "id": "d4",
+          "path": "src/db/client.ts",
+          "title": "Database client",
+          "content": "Provides thin wrapper around PGLite query execution."
+        }
+      ]
+    }
+  ]
+}
diff --git a/benchmarks/retrieval-proofing/profiles.v1.json b/benchmarks/retrieval-proofing/profiles.v1.json
@@ -0,0 +1,36 @@
+{
+  "version": "1.0",
+  "profiles": {
+    "smoke": {
+      "description": "Fast CI profile with a representative subset of benchmark cases",
+      "caseIds": ["repo-layout", "quality-commands", "provider-preflight"],
+      "thresholds": {
+        "hybridMinimums": {
+          "evidenceRelevance": 0.55,
+          "citationSupportCoverage": 0.75,
+          "compositeScore": 0.62,
+          "maxUnsupportedClaimPenalty": 0.45
+        },
+        "baselineDeltaFloors": {
+          "lexical": -0.03,
+          "vector": 0.02
+        }
+      }
+    },
+    "full": {
+      "description": "Full benchmark profile for deeper retrieval proofing",
+      "thresholds": {
+        "hybridMinimums": {
+          "evidenceRelevance": 0.5,
+          "citationSupportCoverage": 0.7,
+          "compositeScore": 0.58,
+          "maxUnsupportedClaimPenalty": 0.5
+        },
+        "baselineDeltaFloors": {
+          "lexical": -0.01,
+          "vector": 0.02
+        }
+      }
+    }
+  }
+}
diff --git a/docs/retrieval-proofing-benchmark-schema.md b/docs/retrieval-proofing-benchmark-schema.md
@@ -0,0 +1,69 @@
+# Retrieval Proofing Benchmark Schema (v1.0)
+
+This document defines the versioned fixture format used by retrieval proofing.
+
+## Fixture File
+
+Path: `benchmarks/retrieval-proofing/benchmark.v1.json`
+
+Top-level shape:
+
+```json
+{
+  "version": "1.0",
+  "datasetName": "retrieval-proofing-core",
+  "datasetVersion": "2026.03.01",
+  "cases": [
+    {
+      "id": "repo-layout",
+      "title": "Find retrieval implementation location",
+      "query": "where is hybrid retrieval implemented",
+      "intent": "lookup",
+      "difficulty": "low",
+      "topK": 3,
+      "expectedEvidenceDocIds": ["d1", "d2"],
+      "documents": [
+        {
+          "id": "d1",
+          "path": "src/context/retrieval/hybrid.ts",
+          "title": "Hybrid retrieval source",
+          "content": "..."
+        },
+        {
+          "id": "d2",
+          "path": "src/context/retrieval/rerank.ts",
+          "title": "Rerank helpers",
+          "content": "..."
+        }
+      ]
+    }
+  ]
+}
+```
+
+## Field Semantics
+
+- `version`: Fixture schema version. Must be the string `"1.0"` for this release.
+- `datasetName`: Human-readable benchmark dataset name.
+- `datasetVersion`: Version of benchmark content. Bump when case content or labels change.
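+- `cases`: Benchmark case list. Besides the fields below, each case carries descriptive `title`, `intent` (e.g. `lookup`, `reasoning`), and `difficulty` (e.g. `low`, `medium`, `high`) fields.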
+- `cases[].id`: Stable identifier used by profile filters and reports.
+- `cases[].query`: Query string used by all retrieval strategies.
+- `cases[].topK`: Number of retrieved documents considered for scoring.
+- `cases[].documents`: Candidate evidence set for the case. Each document has an `id`, `path`, `title`, and `content`.
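+- `cases[].expectedEvidenceDocIds`: Canonical evidence documents used for deterministic scoring. Each entry refers to a `documents[].id` in the same case; document IDs are scoped per case (the fixture reuses `d1`–`d4` in every case).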
+
+## Profile File
+
+Path: `benchmarks/retrieval-proofing/profiles.v1.json`
+
+- `version`: Profile schema version (`1.0`).
+- `profiles.<name>.caseIds`: Optional subset of case IDs (values of `cases[].id`) for this profile; the `full` profile omits it.
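+- `profiles.<name>.thresholds.hybridMinimums`: Absolute thresholds for hybrid metrics: floors for `evidenceRelevance`, `citationSupportCoverage`, and `compositeScore`. Note that `maxUnsupportedClaimPenalty` is, going by its name, an upper bound rather than a floor.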
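+- `profiles.<name>.thresholds.baselineDeltaFloors`: Minimum hybrid-vs-baseline composite deltas, keyed by baseline strategy (`lexical`, `vector`). Assuming the delta is hybrid minus baseline, a negative floor (e.g. `-0.03`) tolerates hybrid scoring slightly below that baseline.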
+
+## Versioning Rules
+
+- Bump `datasetVersion` whenever benchmark content changes.
+- Keep schema `version` at `1.0` unless the JSON structure changes.
+- Prefer adding new cases over mutating existing case IDs to preserve comparability.