From fa7846ec65de481fff63510a531c11322710a31f Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 17:15:37 -0800 Subject: [PATCH 1/2] feat(retrieval): add retrieval quality proofing workflow and archive change - add retrieval proofing benchmark fixtures and profile thresholds - add proofing runner, deterministic scoring, and report generation - add retrieval-proof CLI command and smoke CI gate - add tests and docs for local/CI proofing workflow - archive retrieval-quality-proofing change and sync main spec --- .github/workflows/pr-checks.yml | 3 + README.md | 2 + .../retrieval-proofing/benchmark.v1.json | 217 ++++++++++++++++++ .../retrieval-proofing/profiles.v1.json | 36 +++ docs/retrieval-proofing-benchmark-schema.md | 63 +++++ docs/retrieval-proofing.md | 54 +++++ .../.openspec.yaml | 0 .../design.md | 0 .../proposal.md | 0 .../specs/retrieval-quality-proofing/spec.md | 0 .../tasks.md | 23 ++ .../retrieval-quality-proofing/tasks.md | 23 -- .../specs/retrieval-quality-proofing/spec.md | 37 +++ src/cli/commands.ts | 33 +++ src/cli/commands/retrieval-proof.ts | 51 ++++ src/context/retrieval/proofing/reports.ts | 65 ++++++ src/context/retrieval/proofing/runner.ts | 213 +++++++++++++++++ src/context/retrieval/proofing/schema.ts | 118 ++++++++++ src/context/retrieval/proofing/scoring.ts | 84 +++++++ tests/retrieval-proofing.test.ts | 90 ++++++++ 20 files changed, 1089 insertions(+), 23 deletions(-) create mode 100644 benchmarks/retrieval-proofing/benchmark.v1.json create mode 100644 benchmarks/retrieval-proofing/profiles.v1.json create mode 100644 docs/retrieval-proofing-benchmark-schema.md create mode 100644 docs/retrieval-proofing.md rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/.openspec.yaml (100%) rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/design.md (100%) rename openspec/changes/{retrieval-quality-proofing => 
archive/2026-03-04-retrieval-quality-proofing}/proposal.md (100%) rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/specs/retrieval-quality-proofing/spec.md (100%) create mode 100644 openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md delete mode 100644 openspec/changes/retrieval-quality-proofing/tasks.md create mode 100644 openspec/specs/retrieval-quality-proofing/spec.md create mode 100644 src/cli/commands/retrieval-proof.ts create mode 100644 src/context/retrieval/proofing/reports.ts create mode 100644 src/context/retrieval/proofing/runner.ts create mode 100644 src/context/retrieval/proofing/schema.ts create mode 100644 src/context/retrieval/proofing/scoring.ts create mode 100644 tests/retrieval-proofing.test.ts diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml index 2ec99dc..f97e85a 100644 --- a/.github/workflows/pr-checks.yml +++ b/.github/workflows/pr-checks.yml @@ -43,6 +43,9 @@ jobs: - name: Run checks run: pnpm checks + - name: Run retrieval proofing (smoke profile) + run: pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing + - name: Comment success summary on PR if: ${{ success() && github.event_name == 'pull_request' }} continue-on-error: true diff --git a/README.md b/README.md index b643150..1da25f9 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ pnpm dev -- chat pnpm dev -- chat "summarize this repo" pnpm dev -- plan "create a rollout plan for indexing" pnpm dev -- index . +pnpm dev retrieval-proof --profile smoke pnpm dev -- automations list pnpm dev -- automations add --name "Hourly Check" --cron "0 * * * *" --prompt "summarize local status" pnpm dev -- automations run @@ -94,3 +95,4 @@ Environment variables (BYOK): - Anthropic embeddings currently fall back to deterministic local vectors. - This project intentionally uses Biome only (no ESLint/Prettier). 
+- Retrieval proofing benchmark schema/workflow docs: `docs/retrieval-proofing-benchmark-schema.md` and `docs/retrieval-proofing.md`. diff --git a/benchmarks/retrieval-proofing/benchmark.v1.json b/benchmarks/retrieval-proofing/benchmark.v1.json new file mode 100644 index 0000000..734c13b --- /dev/null +++ b/benchmarks/retrieval-proofing/benchmark.v1.json @@ -0,0 +1,217 @@ +{ + "version": "1.0", + "datasetName": "retrieval-proofing-core", + "datasetVersion": "2026.03.01", + "cases": [ + { + "id": "repo-layout", + "title": "Find retrieval implementation location", + "query": "where is hybrid retrieval implemented", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/context/retrieval/hybrid.ts", + "title": "Hybrid retrieval source", + "content": "The hybrid retrieval runner combines lexical search, vector similarity, and ranking metadata." + }, + { + "id": "d2", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank helpers", + "content": "hybridRerank computes weighted retrieval ordering and cosine similarity for vector retrieval." + }, + { + "id": "d3", + "path": "README.md", + "title": "Project overview", + "content": "General project overview and quick start commands for local development." + }, + { + "id": "d4", + "path": "src/cli/commands.ts", + "title": "CLI commands", + "content": "Registers chat, plan, index, and automations commands." + } + ] + }, + { + "id": "quality-commands", + "title": "Identify quality gates", + "query": "what command runs lint typecheck and tests", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1"], + "documents": [ + { + "id": "d1", + "path": "package.json", + "title": "Project scripts", + "content": "The checks script runs pnpm test, pnpm typecheck, pnpm lint, and pnpm build." 
+ }, + { + "id": "d2", + "path": "README.md", + "title": "README quality commands", + "content": "The README includes lint, typecheck, test, and build as quality commands." + }, + { + "id": "d3", + "path": "src/automation/runner.ts", + "title": "Automation runner", + "content": "Executes configured prompts on cron schedules." + }, + { + "id": "d4", + "path": "src/db/client.ts", + "title": "Database client", + "content": "Initializes PGLite and exposes query and exec methods." + } + ] + }, + { + "id": "provider-preflight", + "title": "Provider preflight requirements", + "query": "which env var is required for google provider preflight", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/cli/commands/chat.tsx", + "title": "Chat preflight", + "content": "Chat preflight checks provider env vars and prints setup instructions for google openai and anthropic." + }, + { + "id": "d2", + "path": "README.md", + "title": "Environment variable docs", + "content": "GOOGLE_GENERATIVE_AI_API_KEY is required when using the google provider." + }, + { + "id": "d3", + "path": "src/mcp/client.ts", + "title": "MCP client", + "content": "Starts and interacts with external MCP servers." + }, + { + "id": "d4", + "path": "src/context/indexer/full-index.ts", + "title": "Indexer", + "content": "Indexes repository files and writes chunks and embeddings." + } + ] + }, + { + "id": "policy-approval", + "title": "Approval behavior", + "query": "which actions require approval in interactive mode", + "intent": "reasoning", + "difficulty": "high", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "README.md", + "title": "Interactive approval docs", + "content": "Sensitive write and destructive tool actions require explicit approve, deny, or dismiss decisions in the TUI." 
+ }, + { + "id": "d2", + "path": "src/policy/engine.ts", + "title": "Policy engine", + "content": "Policy engine classifies tool side effects and enforces approval decisions." + }, + { + "id": "d3", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank", + "content": "Reranks retrieval candidates with weighted score combination." + }, + { + "id": "d4", + "path": "src/db/migrate.ts", + "title": "Migrations", + "content": "Applies schema migrations at startup." + } + ] + }, + { + "id": "automation-hooks", + "title": "Hook trigger behavior", + "query": "what hooks trigger tests or typecheck", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1"], + "documents": [ + { + "id": "d1", + "path": "AGENTS.md", + "title": "Hook definitions", + "content": "file-change runs pnpm test and git-head-change runs pnpm typecheck." + }, + { + "id": "d2", + "path": "README.md", + "title": "README automation", + "content": "Automations can run prompts but does not define hook command mapping." + }, + { + "id": "d3", + "path": "src/automation/scheduler.ts", + "title": "Scheduler", + "content": "Cron scheduler dispatches queued automation specs." + }, + { + "id": "d4", + "path": "src/agent/orchestrator.ts", + "title": "Orchestrator", + "content": "Runs gather reason act verify loops with validation retries." + } + ] + }, + { + "id": "ci-workflow", + "title": "CI checks pipeline", + "query": "where is pr checks workflow defined", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": ".github/workflows/pr-checks.yml", + "title": "PR checks workflow", + "content": "Runs pnpm checks in CI on pull requests." + }, + { + "id": "d2", + "path": "README.md", + "title": "Project quality commands", + "content": "The quality command list maps to CI checks execution." 
+ }, + { + "id": "d3", + "path": "src/tools/registry.ts", + "title": "Tool registry", + "content": "Defines registered local tools and validation." + }, + { + "id": "d4", + "path": "src/db/client.ts", + "title": "Database client", + "content": "Provides thin wrapper around PGLite query execution." + } + ] + } + ] +} diff --git a/benchmarks/retrieval-proofing/profiles.v1.json b/benchmarks/retrieval-proofing/profiles.v1.json new file mode 100644 index 0000000..ff5c585 --- /dev/null +++ b/benchmarks/retrieval-proofing/profiles.v1.json @@ -0,0 +1,36 @@ +{ + "version": "1.0", + "profiles": { + "smoke": { + "description": "Fast CI profile with a representative subset of benchmark cases", + "caseIds": ["repo-layout", "quality-commands", "provider-preflight"], + "thresholds": { + "hybridMinimums": { + "evidenceRelevance": 0.55, + "citationSupportCoverage": 0.75, + "compositeScore": 0.62, + "maxUnsupportedClaimPenalty": 0.45 + }, + "baselineDeltaFloors": { + "lexical": -0.03, + "vector": 0.02 + } + } + }, + "full": { + "description": "Full benchmark profile for deeper retrieval proofing", + "thresholds": { + "hybridMinimums": { + "evidenceRelevance": 0.5, + "citationSupportCoverage": 0.7, + "compositeScore": 0.58, + "maxUnsupportedClaimPenalty": 0.5 + }, + "baselineDeltaFloors": { + "lexical": -0.01, + "vector": 0.02 + } + } + } + } +} diff --git a/docs/retrieval-proofing-benchmark-schema.md b/docs/retrieval-proofing-benchmark-schema.md new file mode 100644 index 0000000..3551b00 --- /dev/null +++ b/docs/retrieval-proofing-benchmark-schema.md @@ -0,0 +1,63 @@ +# Retrieval Proofing Benchmark Schema (v1.0) + +This document defines the versioned fixture format used by retrieval proofing. 
+ +## Fixture File + +Path: `benchmarks/retrieval-proofing/benchmark.v1.json` + +Top-level shape: + +```json +{ + "version": "1.0", + "datasetName": "retrieval-proofing-core", + "datasetVersion": "2026.03.01", + "cases": [ + { + "id": "repo-layout", + "title": "Find retrieval implementation location", + "query": "where is hybrid retrieval implemented", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/context/retrieval/hybrid.ts", + "title": "Hybrid retrieval source", + "content": "..." + } + ] + } + ] +} +``` + +## Field Semantics + +- `version`: Fixture schema version. Must be `1.0` for this release. +- `datasetName`: Human-readable benchmark dataset name. +- `datasetVersion`: Version of benchmark content. Bump when case content or labels change. +- `cases`: Benchmark case list. +- `cases[].id`: Stable identifier used by profile filters and reports. +- `cases[].query`: Query string used by all retrieval strategies. +- `cases[].topK`: Number of retrieved documents considered for scoring. +- `cases[].documents`: Candidate evidence set for the case. +- `cases[].expectedEvidenceDocIds`: Canonical evidence documents used for deterministic scoring. + +## Profile File + +Path: `benchmarks/retrieval-proofing/profiles.v1.json` + +- `version`: Profile schema version (`1.0`). +- `profiles..caseIds`: Optional subset of case IDs for this profile. +- `profiles..thresholds.hybridMinimums`: Absolute floors for hybrid metrics. +- `profiles..thresholds.baselineDeltaFloors`: Minimum hybrid-vs-baseline composite deltas. + +## Versioning Rules + +- Bump `datasetVersion` whenever benchmark content changes. +- Keep schema `version` at `1.0` unless the JSON structure changes. +- Prefer adding new cases over mutating existing case IDs to preserve comparability. 
diff --git a/docs/retrieval-proofing.md b/docs/retrieval-proofing.md new file mode 100644 index 0000000..88d6c5a --- /dev/null +++ b/docs/retrieval-proofing.md @@ -0,0 +1,54 @@ +# Retrieval Quality Proofing + +Retrieval proofing evaluates `lexical`, `vector`, and `hybrid` strategies on the same benchmark dataset, emits JSON/Markdown artifacts, and enforces hybrid quality gates. + +## Run Locally + +Smoke profile: + +```bash +pnpm dev retrieval-proof --profile smoke +``` + +Full profile: + +```bash +pnpm dev retrieval-proof --profile full +``` + +Custom output directory: + +```bash +pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing +``` + +## Artifacts + +Each run writes: + +- `-.json`: per-case metrics, aggregate metrics, hybrid deltas, gate result. +- `-.md`: concise human-readable summary for PR/release notes. + +Default output path: + +- `artifacts/retrieval-proofing/` + +## CI Workflow + +PR checks run retrieval proofing with the smoke profile: + +```bash +pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing +``` + +If hybrid thresholds fail, the command exits non-zero and CI fails. + +## Updating Baseline Thresholds Safely + +1. Run the full profile locally and inspect both JSON and Markdown reports. +2. Confirm changes are intentional and linked to retrieval behavior changes. +3. Update thresholds in `benchmarks/retrieval-proofing/profiles.v1.json`. +4. Re-run both `smoke` and `full` profiles and ensure results are stable. +5. Include rationale for threshold changes in PR description (what changed and why). + +Avoid lowering thresholds to mask regressions. Prefer improving retrieval behavior first. 
diff --git a/openspec/changes/retrieval-quality-proofing/.openspec.yaml b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/.openspec.yaml similarity index 100% rename from openspec/changes/retrieval-quality-proofing/.openspec.yaml rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/.openspec.yaml diff --git a/openspec/changes/retrieval-quality-proofing/design.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/design.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/design.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/design.md diff --git a/openspec/changes/retrieval-quality-proofing/proposal.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/proposal.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/proposal.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/proposal.md diff --git a/openspec/changes/retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md diff --git a/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md new file mode 100644 index 0000000..b914fd6 --- /dev/null +++ b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md @@ -0,0 +1,23 @@ +## 1. 
Benchmark Dataset and Configuration + +- [x] 1.1 Define and document versioned benchmark fixture schema for retrieval proofing cases +- [x] 1.2 Add initial benchmark dataset covering multiple query intents and grounding difficulty levels +- [x] 1.3 Implement profile-based proofing configuration (for example `smoke` and `full`) with threshold settings + +## 2. Evaluation Runner and Scoring + +- [x] 2.1 Implement retrieval proofing runner that executes lexical, vector, and hybrid modes over the same case set +- [x] 2.2 Implement deterministic grounding metric scoring for evidence relevance, citation support, and unsupported-claim penalty +- [x] 2.3 Add aggregate scoring and strategy delta computation suitable for pass/fail gating + +## 3. Reporting and CLI Integration + +- [x] 3.1 Add CLI command(s) to run retrieval proofing for a selected benchmark profile +- [x] 3.2 Generate JSON report artifacts with per-case and aggregate metrics for each strategy +- [x] 3.3 Generate Markdown summary report highlighting hybrid-vs-baseline outcomes and gate status + +## 4. Quality Gates and Verification + +- [x] 4.1 Integrate proofing command into CI with non-zero exit on failed hybrid thresholds +- [x] 4.2 Add tests for scoring determinism, report schema stability, and gate pass/fail behavior +- [x] 4.3 Document local and CI proofing workflows, including how to update baseline thresholds safely diff --git a/openspec/changes/retrieval-quality-proofing/tasks.md b/openspec/changes/retrieval-quality-proofing/tasks.md deleted file mode 100644 index 0fe1789..0000000 --- a/openspec/changes/retrieval-quality-proofing/tasks.md +++ /dev/null @@ -1,23 +0,0 @@ -## 1. 
Benchmark Dataset and Configuration - -- [ ] 1.1 Define and document versioned benchmark fixture schema for retrieval proofing cases -- [ ] 1.2 Add initial benchmark dataset covering multiple query intents and grounding difficulty levels -- [ ] 1.3 Implement profile-based proofing configuration (for example `smoke` and `full`) with threshold settings - -## 2. Evaluation Runner and Scoring - -- [ ] 2.1 Implement retrieval proofing runner that executes lexical, vector, and hybrid modes over the same case set -- [ ] 2.2 Implement deterministic grounding metric scoring for evidence relevance, citation support, and unsupported-claim penalty -- [ ] 2.3 Add aggregate scoring and strategy delta computation suitable for pass/fail gating - -## 3. Reporting and CLI Integration - -- [ ] 3.1 Add CLI command(s) to run retrieval proofing for a selected benchmark profile -- [ ] 3.2 Generate JSON report artifacts with per-case and aggregate metrics for each strategy -- [ ] 3.3 Generate Markdown summary report highlighting hybrid-vs-baseline outcomes and gate status - -## 4. Quality Gates and Verification - -- [ ] 4.1 Integrate proofing command into CI with non-zero exit on failed hybrid thresholds -- [ ] 4.2 Add tests for scoring determinism, report schema stability, and gate pass/fail behavior -- [ ] 4.3 Document local and CI proofing workflows, including how to update baseline thresholds safely diff --git a/openspec/specs/retrieval-quality-proofing/spec.md b/openspec/specs/retrieval-quality-proofing/spec.md new file mode 100644 index 0000000..ef4b8ec --- /dev/null +++ b/openspec/specs/retrieval-quality-proofing/spec.md @@ -0,0 +1,37 @@ +# retrieval-quality-proofing Specification + +## Purpose +TBD - created by archiving change retrieval-quality-proofing. Update Purpose after archive. 
+## Requirements +### Requirement: Multi-Strategy Retrieval Evaluation +The system MUST execute the same benchmark question set against at least three retrieval strategies: lexical-only, vector-only, and hybrid. + +#### Scenario: Compare strategies on shared benchmark +- **WHEN** a proofing run starts for a benchmark profile +- **THEN** the system runs every benchmark case across lexical, vector, and hybrid modes using identical inputs and scoring configuration + +### Requirement: Deterministic Grounding Metrics +The system MUST calculate deterministic grounding metrics for each benchmark case and strategy, including evidence relevance, citation support coverage, and unsupported-claim penalty. + +#### Scenario: Produce deterministic scores +- **WHEN** the same benchmark profile and repository state are evaluated multiple times +- **THEN** the computed grounding metrics and aggregate scores are identical across runs except for explicitly declared non-deterministic fields + +### Requirement: Versioned Benchmark and Report Artifacts +The system MUST support versioned benchmark fixtures and emit both machine-readable and human-readable report artifacts for every proofing run. + +#### Scenario: Generate proof artifacts +- **WHEN** a proofing run completes +- **THEN** the system writes a JSON report containing per-case and aggregate metric values and writes a Markdown summary highlighting strategy deltas and pass/fail gate status + +### Requirement: Hybrid Quality Gate Enforcement +The system MUST enforce configurable quality gates that verify hybrid retrieval outperforms configured baseline strategies on grounding metrics. 
+ +#### Scenario: Gate fails on hybrid regression +- **WHEN** a proofing run determines that hybrid retrieval does not meet configured improvement thresholds versus baseline +- **THEN** the command exits non-zero and marks the run as failed for CI enforcement + +#### Scenario: Gate passes on acceptable hybrid improvement +- **WHEN** a proofing run determines that hybrid retrieval meets configured improvement thresholds versus baseline +- **THEN** the command exits zero and marks the run as passing + diff --git a/src/cli/commands.ts b/src/cli/commands.ts index 683716f..00564bc 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -7,6 +7,7 @@ import { import { runChatCommand } from './commands/chat'; import { runIndexCommand } from './commands/index'; import { runPlanCommand } from './commands/plan'; +import { runRetrievalProofCommand } from './commands/retrieval-proof'; export function createProgram(): Command { const program = new Command(); @@ -37,6 +38,38 @@ export function createProgram(): Command { await runIndexCommand(repoRoot); }); + program + .command('retrieval-proof') + .description('Run retrieval quality proofing against benchmark profiles') + .option( + '--benchmark ', + 'benchmark fixture JSON path', + 'benchmarks/retrieval-proofing/benchmark.v1.json' + ) + .option( + '--profiles ', + 'benchmark profiles JSON path', + 'benchmarks/retrieval-proofing/profiles.v1.json' + ) + .option('--profile ', 'benchmark profile name', 'smoke') + .option( + '--output-dir ', + 'directory for generated reports', + 'artifacts/retrieval-proofing' + ) + .option('--no-fail-on-gate', 'do not exit non-zero when gate fails') + .action( + async (options: { + benchmark: string; + profiles: string; + profile: string; + outputDir: string; + failOnGate: boolean; + }) => { + await runRetrievalProofCommand(options); + } + ); + const automations = program.command('automations').description('Manage local automations'); automations.command('list').action(async () => { diff --git 
a/src/cli/commands/retrieval-proof.ts b/src/cli/commands/retrieval-proof.ts new file mode 100644 index 0000000..905a2c1 --- /dev/null +++ b/src/cli/commands/retrieval-proof.ts @@ -0,0 +1,51 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import { isAbsolute, join, resolve } from 'node:path'; +import { formatProofingMarkdown } from '../../context/retrieval/proofing/reports'; +import { runRetrievalProofing } from '../../context/retrieval/proofing/runner'; + +type RetrievalProofCommandOptions = { + benchmark: string; + profiles: string; + profile: string; + outputDir: string; + failOnGate: boolean; +}; + +export async function runRetrievalProofCommand( + options: RetrievalProofCommandOptions +): Promise { + const benchmarkPath = absoluteFromCwd(options.benchmark); + const profilesPath = absoluteFromCwd(options.profiles); + const outputDir = absoluteFromCwd(options.outputDir); + + const report = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: options.profile, + }); + + await mkdir(outputDir, { recursive: true }); + const timestamp = report.generatedAt.replaceAll(':', '-'); + const baseName = `${report.profile}-${timestamp}`; + const jsonPath = join(outputDir, `${baseName}.json`); + const markdownPath = join(outputDir, `${baseName}.md`); + + await writeFile(jsonPath, `${JSON.stringify(report, null, 2)}\n`, 'utf8'); + await writeFile(markdownPath, `${formatProofingMarkdown(report)}\n`, 'utf8'); + + console.log(`Retrieval proofing complete for profile "${report.profile}".`); + console.log(`Gate status: ${report.gate.passed ? 
'PASS' : 'FAIL'}`); + console.log(`JSON report: ${jsonPath}`); + console.log(`Markdown report: ${markdownPath}`); + + if (!report.gate.passed && options.failOnGate) { + throw new Error(`Retrieval proofing gate failed: ${report.gate.failures.join('; ')}`); + } +} + +function absoluteFromCwd(path: string): string { + if (isAbsolute(path)) { + return path; + } + return resolve(process.cwd(), path); +} diff --git a/src/context/retrieval/proofing/reports.ts b/src/context/retrieval/proofing/reports.ts new file mode 100644 index 0000000..0067ec2 --- /dev/null +++ b/src/context/retrieval/proofing/reports.ts @@ -0,0 +1,65 @@ +import type { RetrievalProofingReport } from './schema'; + +export function formatProofingMarkdown(report: RetrievalProofingReport): string { + const hybridAggregate = report.strategies.hybrid.aggregate.metrics; + const lexicalAggregate = report.strategies.lexical.aggregate.metrics; + const vectorAggregate = report.strategies.vector.aggregate.metrics; + + const lines = [ + '# Retrieval Quality Proofing Report', + '', + `- Generated: ${report.generatedAt}`, + `- Benchmark: ${report.benchmark.datasetName}@${report.benchmark.datasetVersion}`, + `- Profile: ${report.profile}`, + `- Gate: ${report.gate.passed ? 
'PASS' : 'FAIL'}`, + '', + '## Aggregate Metrics', + '', + '| Strategy | Evidence Relevance | Citation Coverage | Unsupported Penalty | Composite |', + '| --- | ---: | ---: | ---: | ---: |', + renderAggregateRow('hybrid', hybridAggregate), + renderAggregateRow('lexical', lexicalAggregate), + renderAggregateRow('vector', vectorAggregate), + '', + '## Hybrid Deltas vs Baselines', + '', + '| Baseline | Evidence Relevance Δ | Citation Coverage Δ | Unsupported Penalty Δ | Composite Δ |', + '| --- | ---: | ---: | ---: | ---: |', + ...report.hybridDeltas.map((entry) => + [ + `| ${entry.baseline}`, + `${entry.metricDeltas.evidenceRelevance.toFixed(3)}`, + `${entry.metricDeltas.citationSupportCoverage.toFixed(3)}`, + `${entry.metricDeltas.unsupportedClaimPenalty.toFixed(3)}`, + `${entry.metricDeltas.compositeScore.toFixed(3)} |`, + ].join(' | ') + ), + '', + '## Gate Status', + '', + report.gate.passed ? '- All configured thresholds passed.' : '- Failure reasons:', + ...report.gate.failures.map((failure) => ` - ${failure}`), + '', + '## Per-Case Hybrid Summary', + '', + '| Case | Retrieved Doc IDs | Expected Evidence IDs | Composite |', + '| --- | --- | --- | ---: |', + ...report.strategies.hybrid.cases.map((entry) => + [ + `| ${entry.caseId}`, + entry.retrievedDocIds.join(', '), + entry.expectedEvidenceDocIds.join(', '), + `${entry.metrics.compositeScore.toFixed(3)} |`, + ].join(' | ') + ), + ]; + + return lines.join('\n'); +} + +function renderAggregateRow( + strategy: string, + metrics: RetrievalProofingReport['strategies']['hybrid']['aggregate']['metrics'] +): string { + return `| ${strategy} | ${metrics.evidenceRelevance.toFixed(3)} | ${metrics.citationSupportCoverage.toFixed(3)} | ${metrics.unsupportedClaimPenalty.toFixed(3)} | ${metrics.compositeScore.toFixed(3)} |`; +} diff --git a/src/context/retrieval/proofing/runner.ts b/src/context/retrieval/proofing/runner.ts new file mode 100644 index 0000000..00be525 --- /dev/null +++ 
b/src/context/retrieval/proofing/runner.ts @@ -0,0 +1,213 @@ +import { readFile } from 'node:fs/promises'; +import { cosineSimilarity, deterministicEmbedding } from '../rerank'; +import { + type BenchmarkCase, + type BenchmarkProfile, + BenchmarkProfilesSchema, + RetrievalBenchmarkSchema, + type RetrievalProofingReport, + RetrievalProofingReportSchema, + type RetrievalProofingStrategy, +} from './schema'; +import { averageCaseMetrics, scoreCaseMetrics, subtractMetrics } from './scoring'; + +const STRATEGIES: RetrievalProofingStrategy[] = ['lexical', 'vector', 'hybrid']; + +export async function runRetrievalProofing(input: { + benchmarkPath: string; + profilesPath: string; + profileName: string; +}): Promise { + const benchmark = await loadBenchmark(input.benchmarkPath); + const profiles = await loadProfiles(input.profilesPath); + const profile = profiles.profiles[input.profileName]; + + if (!profile) { + throw new Error( + `Unknown benchmark profile "${input.profileName}". Available: ${Object.keys(profiles.profiles).join(', ')}` + ); + } + + const selectedCases = selectCases(benchmark.cases, profile); + const strategyReports = Object.fromEntries( + STRATEGIES.map((strategy) => { + const cases = selectedCases.map((benchmarkCase) => { + const retrievedDocIds = retrieveDocsForCase(benchmarkCase, strategy); + const metrics = scoreCaseMetrics({ benchmarkCase, retrievedDocIds }); + return { + caseId: benchmarkCase.id, + strategy, + retrievedDocIds, + expectedEvidenceDocIds: benchmarkCase.expectedEvidenceDocIds, + metrics, + }; + }); + const aggregate = { + strategy, + metrics: averageCaseMetrics(cases.map((entry) => entry.metrics)), + }; + return [strategy, { cases, aggregate }]; + }) + ) as RetrievalProofingReport['strategies']; + + const hybridAggregate = strategyReports.hybrid.aggregate.metrics; + const lexicalAggregate = strategyReports.lexical.aggregate.metrics; + const vectorAggregate = strategyReports.vector.aggregate.metrics; + + const hybridDeltas = [ + { + 
baseline: 'lexical' as const, + metricDeltas: subtractMetrics(hybridAggregate, lexicalAggregate), + }, + { + baseline: 'vector' as const, + metricDeltas: subtractMetrics(hybridAggregate, vectorAggregate), + }, + ]; + + const gateFailures = evaluateGate({ + profile, + hybridAggregate, + lexicalAggregate, + vectorAggregate, + }); + + return RetrievalProofingReportSchema.parse({ + schemaVersion: '1.0', + benchmark: { + datasetName: benchmark.datasetName, + datasetVersion: benchmark.datasetVersion, + }, + profile: input.profileName, + generatedAt: new Date().toISOString(), + strategies: strategyReports, + hybridDeltas, + gate: { + passed: gateFailures.length === 0, + failures: gateFailures, + }, + }); +} + +function evaluateGate(input: { + profile: BenchmarkProfile; + hybridAggregate: RetrievalProofingReport['strategies']['hybrid']['aggregate']['metrics']; + lexicalAggregate: RetrievalProofingReport['strategies']['lexical']['aggregate']['metrics']; + vectorAggregate: RetrievalProofingReport['strategies']['vector']['aggregate']['metrics']; +}): string[] { + const failures: string[] = []; + const minimums = input.profile.thresholds.hybridMinimums; + const deltas = input.profile.thresholds.baselineDeltaFloors; + const hybrid = input.hybridAggregate; + + if (hybrid.evidenceRelevance < minimums.evidenceRelevance) { + failures.push( + `hybrid evidenceRelevance ${hybrid.evidenceRelevance.toFixed(3)} < ${minimums.evidenceRelevance.toFixed(3)}` + ); + } + if (hybrid.citationSupportCoverage < minimums.citationSupportCoverage) { + failures.push( + `hybrid citationSupportCoverage ${hybrid.citationSupportCoverage.toFixed(3)} < ${minimums.citationSupportCoverage.toFixed(3)}` + ); + } + if (hybrid.compositeScore < minimums.compositeScore) { + failures.push( + `hybrid compositeScore ${hybrid.compositeScore.toFixed(3)} < ${minimums.compositeScore.toFixed(3)}` + ); + } + if (hybrid.unsupportedClaimPenalty > minimums.maxUnsupportedClaimPenalty) { + failures.push( + `hybrid 
unsupportedClaimPenalty ${hybrid.unsupportedClaimPenalty.toFixed(3)} > ${minimums.maxUnsupportedClaimPenalty.toFixed(3)}` + ); + } + + const hybridVsLexical = hybrid.compositeScore - input.lexicalAggregate.compositeScore; + if (hybridVsLexical < deltas.lexical) { + failures.push( + `hybrid-vs-lexical composite delta ${hybridVsLexical.toFixed(3)} < ${deltas.lexical.toFixed(3)}` + ); + } + + const hybridVsVector = hybrid.compositeScore - input.vectorAggregate.compositeScore; + if (hybridVsVector < deltas.vector) { + failures.push( + `hybrid-vs-vector composite delta ${hybridVsVector.toFixed(3)} < ${deltas.vector.toFixed(3)}` + ); + } + + return failures; +} + +function selectCases(cases: BenchmarkCase[], profile: BenchmarkProfile): BenchmarkCase[] { + if (!profile.caseIds || profile.caseIds.length === 0) { + return cases; + } + + const wanted = new Set(profile.caseIds); + const selected = cases.filter((entry) => wanted.has(entry.id)); + if (selected.length !== profile.caseIds.length) { + const selectedIds = new Set(selected.map((entry) => entry.id)); + const missing = profile.caseIds.filter((id) => !selectedIds.has(id)); + throw new Error(`Profile references missing benchmark case IDs: ${missing.join(', ')}`); + } + return selected; +} + +function retrieveDocsForCase( + benchmarkCase: BenchmarkCase, + strategy: RetrievalProofingStrategy +): string[] { + const rows = benchmarkCase.documents.map((doc) => { + const lexicalScore = computeLexicalScore(benchmarkCase.query, `${doc.title} ${doc.content}`); + const vectorScore = cosineSimilarity( + deterministicEmbedding(benchmarkCase.query), + deterministicEmbedding(`${doc.title} ${doc.content}`) + ); + const totalScore = + strategy === 'lexical' + ? lexicalScore + : strategy === 'vector' + ? 
vectorScore + : lexicalScore * 0.7 + vectorScore * 0.3; + + return { + id: doc.id, + totalScore, + }; + }); + + return rows + .sort((a, b) => b.totalScore - a.totalScore) + .slice(0, benchmarkCase.topK) + .map((entry) => entry.id); +} + +function computeLexicalScore(query: string, haystack: string): number { + const tokens = query + .toLowerCase() + .split(/[^a-z0-9]+/g) + .filter(Boolean); + if (tokens.length === 0) { + return 0; + } + + const source = haystack.toLowerCase(); + let score = 0; + for (const token of tokens) { + if (source.includes(token)) { + score += 1; + } + } + + return score / tokens.length; +} + +async function loadBenchmark(path: string) { + const raw = await readFile(path, 'utf8'); + return RetrievalBenchmarkSchema.parse(JSON.parse(raw)); +} + +async function loadProfiles(path: string) { + const raw = await readFile(path, 'utf8'); + return BenchmarkProfilesSchema.parse(JSON.parse(raw)); +} diff --git a/src/context/retrieval/proofing/schema.ts b/src/context/retrieval/proofing/schema.ts new file mode 100644 index 0000000..0e7a78c --- /dev/null +++ b/src/context/retrieval/proofing/schema.ts @@ -0,0 +1,118 @@ +import { z } from 'zod'; + +export const RetrievalProofingStrategySchema = z.enum(['lexical', 'vector', 'hybrid']); +export type RetrievalProofingStrategy = z.infer<typeof RetrievalProofingStrategySchema>; + +export const BenchmarkDocumentSchema = z.object({ + id: z.string().min(1), + path: z.string().min(1), + title: z.string().min(1), + content: z.string().min(1), +}); + +export const BenchmarkCaseSchema = z.object({ + id: z.string().min(1), + title: z.string().min(1), + query: z.string().min(1), + intent: z.string().min(1), + difficulty: z.enum(['low', 'medium', 'high']), + topK: z.number().int().positive().default(3), + documents: z.array(BenchmarkDocumentSchema).min(2), + expectedEvidenceDocIds: z.array(z.string().min(1)).min(1), +}); +export type BenchmarkCase = z.infer<typeof BenchmarkCaseSchema>; + +export const RetrievalBenchmarkSchema = z.object({ + version: z.literal('1.0'), + datasetName:
z.string().min(1), + datasetVersion: z.string().min(1), + cases: z.array(BenchmarkCaseSchema).min(1), +}); +export type RetrievalBenchmark = z.infer<typeof RetrievalBenchmarkSchema>; + +export const ProofingThresholdsSchema = z.object({ + hybridMinimums: z.object({ + evidenceRelevance: z.number().min(0).max(1), + citationSupportCoverage: z.number().min(0).max(1), + compositeScore: z.number().min(0).max(1), + maxUnsupportedClaimPenalty: z.number().min(0).max(1), + }), + baselineDeltaFloors: z.object({ + lexical: z.number(), + vector: z.number(), + }), +}); +export type ProofingThresholds = z.infer<typeof ProofingThresholdsSchema>; + +export const BenchmarkProfileSchema = z.object({ + description: z.string().min(1), + caseIds: z.array(z.string().min(1)).optional(), + thresholds: ProofingThresholdsSchema, +}); +export type BenchmarkProfile = z.infer<typeof BenchmarkProfileSchema>; + +export const BenchmarkProfilesSchema = z.object({ + version: z.literal('1.0'), + profiles: z.record(z.string(), BenchmarkProfileSchema), +}); +export type BenchmarkProfiles = z.infer<typeof BenchmarkProfilesSchema>; + +export const CaseMetricsSchema = z.object({ + evidenceRelevance: z.number().min(0).max(1), + citationSupportCoverage: z.number().min(0).max(1), + unsupportedClaimPenalty: z.number().min(0).max(1), + compositeScore: z.number().min(0).max(1), +}); +export type CaseMetrics = z.infer<typeof CaseMetricsSchema>; + +export const StrategyCaseResultSchema = z.object({ + caseId: z.string(), + strategy: RetrievalProofingStrategySchema, + retrievedDocIds: z.array(z.string()), + expectedEvidenceDocIds: z.array(z.string()), + metrics: CaseMetricsSchema, +}); +export type StrategyCaseResult = z.infer<typeof StrategyCaseResultSchema>; + +export const StrategyAggregateSchema = z.object({ + strategy: RetrievalProofingStrategySchema, + metrics: CaseMetricsSchema, +}); +export type StrategyAggregate = z.infer<typeof StrategyAggregateSchema>; + +export const StrategyDeltaSchema = z.object({ + baseline: z.enum(['lexical', 'vector']), + metricDeltas: z.object({ + evidenceRelevance: z.number(), + citationSupportCoverage: z.number(), + unsupportedClaimPenalty: z.number(), + compositeScore: z.number(), + }), +});
+export type StrategyDelta = z.infer<typeof StrategyDeltaSchema>; + +export const GateResultSchema = z.object({ + passed: z.boolean(), + failures: z.array(z.string()), +}); +export type GateResult = z.infer<typeof GateResultSchema>; + +export const RetrievalProofingReportSchema = z.object({ + schemaVersion: z.literal('1.0'), + benchmark: z.object({ + datasetName: z.string(), + datasetVersion: z.string(), + }), + profile: z.string(), + generatedAt: z.string(), + strategies: z.record( + RetrievalProofingStrategySchema, + z.object({ + cases: z.array(StrategyCaseResultSchema), + aggregate: StrategyAggregateSchema, + }) + ), + hybridDeltas: z.array(StrategyDeltaSchema), + gate: GateResultSchema, +}); +export type RetrievalProofingReport = z.infer<typeof RetrievalProofingReportSchema>; diff --git a/src/context/retrieval/proofing/scoring.ts b/src/context/retrieval/proofing/scoring.ts new file mode 100644 index 0000000..790a032 --- /dev/null +++ b/src/context/retrieval/proofing/scoring.ts @@ -0,0 +1,84 @@ +import type { BenchmarkCase, CaseMetrics } from './schema'; + +export function scoreCaseMetrics(input: { + benchmarkCase: BenchmarkCase; + retrievedDocIds: string[]; +}): CaseMetrics { + const { benchmarkCase, retrievedDocIds } = input; + const topK = benchmarkCase.topK; + const expectedSet = new Set(benchmarkCase.expectedEvidenceDocIds); + const top = retrievedDocIds.slice(0, topK); + const weightedHits = top.reduce((acc, docId, index) => { + if (!expectedSet.has(docId)) { + return acc; + } + return acc + (topK - index) / topK; + }, 0); + const weightDenominator = top.reduce((acc, _docId, index) => acc + (topK - index) / topK, 0); + const hits = top.filter((docId) => expectedSet.has(docId)).length; + + const evidenceRelevance = divide(weightedHits, weightDenominator); + const citationSupportCoverage = divide(hits, benchmarkCase.expectedEvidenceDocIds.length); + const unsupportedClaimPenalty = divide(topK - hits, topK); + const compositeScore = clamp01( + evidenceRelevance * 0.45 + citationSupportCoverage * 0.45 + (1 - unsupportedClaimPenalty) * 0.1 + ); + 
return { + evidenceRelevance, + citationSupportCoverage, + unsupportedClaimPenalty, + compositeScore, + }; +} + +export function averageCaseMetrics(metrics: CaseMetrics[]): CaseMetrics { + if (metrics.length === 0) { + return { + evidenceRelevance: 0, + citationSupportCoverage: 0, + unsupportedClaimPenalty: 1, + compositeScore: 0, + }; + } + + return { + evidenceRelevance: average(metrics.map((metric) => metric.evidenceRelevance)), + citationSupportCoverage: average(metrics.map((metric) => metric.citationSupportCoverage)), + unsupportedClaimPenalty: average(metrics.map((metric) => metric.unsupportedClaimPenalty)), + compositeScore: average(metrics.map((metric) => metric.compositeScore)), + }; +} + +export function subtractMetrics(a: CaseMetrics, b: CaseMetrics): CaseMetrics { + return { + evidenceRelevance: a.evidenceRelevance - b.evidenceRelevance, + citationSupportCoverage: a.citationSupportCoverage - b.citationSupportCoverage, + unsupportedClaimPenalty: a.unsupportedClaimPenalty - b.unsupportedClaimPenalty, + compositeScore: a.compositeScore - b.compositeScore, + }; +} + +function average(values: number[]): number { + if (values.length === 0) { + return 0; + } + return values.reduce((acc, value) => acc + value, 0) / values.length; +} + +function divide(num: number, den: number): number { + if (den <= 0) { + return 0; + } + return num / den; +} + +function clamp01(value: number): number { + if (value <= 0) { + return 0; + } + if (value >= 1) { + return 1; + } + return value; +} diff --git a/tests/retrieval-proofing.test.ts b/tests/retrieval-proofing.test.ts new file mode 100644 index 0000000..66333fb --- /dev/null +++ b/tests/retrieval-proofing.test.ts @@ -0,0 +1,90 @@ +import { mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join, resolve } from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { runRetrievalProofing } from '../src/context/retrieval/proofing/runner'; +import { 
RetrievalProofingReportSchema } from '../src/context/retrieval/proofing/schema'; + +const benchmarkPath = resolve(process.cwd(), 'benchmarks/retrieval-proofing/benchmark.v1.json'); +const profilesPath = resolve(process.cwd(), 'benchmarks/retrieval-proofing/profiles.v1.json'); + +describe('retrieval proofing', () => { + it('produces deterministic scoring for fixed benchmark/profile inputs', async () => { + const first = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + const second = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + + expect(first.generatedAt).not.toEqual(second.generatedAt); + expect({ ...first, generatedAt: 'fixed' }).toEqual({ ...second, generatedAt: 'fixed' }); + }); + + it('keeps JSON report schema stable and parseable', async () => { + const report = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + + const parsed = RetrievalProofingReportSchema.parse(report); + + expect(parsed.schemaVersion).toBe('1.0'); + expect(Object.keys(parsed.strategies)).toEqual(['lexical', 'vector', 'hybrid']); + expect(parsed.strategies.hybrid.cases.length).toBeGreaterThan(0); + expect(typeof parsed.gate.passed).toBe('boolean'); + }); + + it('reports gate pass/fail based on configured thresholds', async () => { + const passReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + expect(passReport.gate.passed).toBe(true); + + const tempDir = await mkdtemp(join(tmpdir(), 'retrieval-proofing-')); + const strictProfilesPath = join(tempDir, 'profiles.strict.json'); + const baseProfiles = JSON.parse(await readFile(profilesPath, 'utf8')) as { + version: string; + profiles: Record; + }; + + const strictProfiles = { + ...baseProfiles, + profiles: { + ...baseProfiles.profiles, + smoke: { + description: 'strict gate for failure test', + thresholds: { + hybridMinimums: { + evidenceRelevance: 0.99, + 
citationSupportCoverage: 0.99, + compositeScore: 0.99, + maxUnsupportedClaimPenalty: 0.01, + }, + baselineDeltaFloors: { + lexical: 0.2, + vector: 0.2, + }, + }, + }, + }, + }; + await writeFile(strictProfilesPath, `${JSON.stringify(strictProfiles, null, 2)}\n`, 'utf8'); + + const failReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath: strictProfilesPath, + profileName: 'smoke', + }); + + expect(failReport.gate.passed).toBe(false); + expect(failReport.gate.failures.length).toBeGreaterThan(0); + }); +}); From dd34f45415d1b2f7055fb29e8400bc1b9f519ffa Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 19:13:08 -0800 Subject: [PATCH 2/2] fix(retrieval): address PR feedback on docs and test stability --- docs/retrieval-proofing-benchmark-schema.md | 6 ++ .../specs/retrieval-quality-proofing/spec.md | 5 +- tests/retrieval-proofing.test.ts | 63 ++++++++++--------- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/docs/retrieval-proofing-benchmark-schema.md b/docs/retrieval-proofing-benchmark-schema.md index 3551b00..4f30f2a 100644 --- a/docs/retrieval-proofing-benchmark-schema.md +++ b/docs/retrieval-proofing-benchmark-schema.md @@ -28,6 +28,12 @@ Top-level shape: "path": "src/context/retrieval/hybrid.ts", "title": "Hybrid retrieval source", "content": "..." + }, + { + "id": "d2", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank helpers", + "content": "..." } ] } diff --git a/openspec/specs/retrieval-quality-proofing/spec.md b/openspec/specs/retrieval-quality-proofing/spec.md index ef4b8ec..4c5794e 100644 --- a/openspec/specs/retrieval-quality-proofing/spec.md +++ b/openspec/specs/retrieval-quality-proofing/spec.md @@ -1,7 +1,9 @@ # retrieval-quality-proofing Specification ## Purpose -TBD - created by archiving change retrieval-quality-proofing. Update Purpose after archive. 
+Define how the system evaluates, compares, and enforces retrieval quality across lexical, vector, +and hybrid strategies using deterministic benchmarks, grounding-focused metrics, and +CI-enforceable quality gates. ## Requirements ### Requirement: Multi-Strategy Retrieval Evaluation The system MUST execute the same benchmark question set against at least three retrieval strategies: lexical-only, vector-only, and hybrid. @@ -34,4 +36,3 @@ The system MUST enforce configurable quality gates that verify hybrid retrieval #### Scenario: Gate passes on acceptable hybrid improvement - **WHEN** a proofing run determines that hybrid retrieval meets configured improvement thresholds versus baseline - **THEN** the command exits zero and marks the run as passing - diff --git a/tests/retrieval-proofing.test.ts b/tests/retrieval-proofing.test.ts index 66333fb..a0bca43 100644 --- a/tests/retrieval-proofing.test.ts +++ b/tests/retrieval-proofing.test.ts @@ -1,4 +1,4 @@ -import { mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join, resolve } from 'node:path'; import { describe, expect, it } from 'vitest'; @@ -21,7 +21,6 @@ describe('retrieval proofing', () => { profileName: 'smoke', }); - expect(first.generatedAt).not.toEqual(second.generatedAt); expect({ ...first, generatedAt: 'fixed' }).toEqual({ ...second, generatedAt: 'fixed' }); }); @@ -52,39 +51,45 @@ describe('retrieval proofing', () => { const strictProfilesPath = join(tempDir, 'profiles.strict.json'); const baseProfiles = JSON.parse(await readFile(profilesPath, 'utf8')) as { version: string; - profiles: Record; + profiles: Record; }; - const strictProfiles = { - ...baseProfiles, - profiles: { - ...baseProfiles.profiles, - smoke: { - description: 'strict gate for failure test', - thresholds: { - hybridMinimums: { - evidenceRelevance: 0.99, - citationSupportCoverage: 0.99, - compositeScore: 0.99, - 
maxUnsupportedClaimPenalty: 0.01, - }, - baselineDeltaFloors: { - lexical: 0.2, - vector: 0.2, + try { + const existingSmoke = baseProfiles.profiles.smoke ?? {}; + const strictProfiles = { + ...baseProfiles, + profiles: { + ...baseProfiles.profiles, + smoke: { + ...existingSmoke, + description: 'strict gate for failure test', + thresholds: { + hybridMinimums: { + evidenceRelevance: 0.99, + citationSupportCoverage: 0.99, + compositeScore: 0.99, + maxUnsupportedClaimPenalty: 0.01, + }, + baselineDeltaFloors: { + lexical: 0.2, + vector: 0.2, + }, }, }, }, - }, - }; - await writeFile(strictProfilesPath, `${JSON.stringify(strictProfiles, null, 2)}\n`, 'utf8'); + }; + await writeFile(strictProfilesPath, `${JSON.stringify(strictProfiles, null, 2)}\n`, 'utf8'); - const failReport = await runRetrievalProofing({ - benchmarkPath, - profilesPath: strictProfilesPath, - profileName: 'smoke', - }); + const failReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath: strictProfilesPath, + profileName: 'smoke', + }); - expect(failReport.gate.passed).toBe(false); - expect(failReport.gate.failures.length).toBeGreaterThan(0); + expect(failReport.gate.passed).toBe(false); + expect(failReport.gate.failures.length).toBeGreaterThan(0); + } finally { + await rm(tempDir, { recursive: true, force: true }); + } }); });