From fa7846ec65de481fff63510a531c11322710a31f Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 17:15:37 -0800 Subject: [PATCH 1/2] feat(retrieval): add retrieval quality proofing workflow and archive change - add retrieval proofing benchmark fixtures and profile thresholds - add proofing runner, deterministic scoring, and report generation - add retrieval-proof CLI command and smoke CI gate - add tests and docs for local/CI proofing workflow - archive retrieval-quality-proofing change and sync main spec --- .github/workflows/pr-checks.yml | 3 + README.md | 2 + .../retrieval-proofing/benchmark.v1.json | 217 ++++++++++++++++++ .../retrieval-proofing/profiles.v1.json | 36 +++ docs/retrieval-proofing-benchmark-schema.md | 63 +++++ docs/retrieval-proofing.md | 54 +++++ .../.openspec.yaml | 0 .../design.md | 0 .../proposal.md | 0 .../specs/retrieval-quality-proofing/spec.md | 0 .../tasks.md | 23 ++ .../retrieval-quality-proofing/tasks.md | 23 -- .../specs/retrieval-quality-proofing/spec.md | 37 +++ src/cli/commands.ts | 33 +++ src/cli/commands/retrieval-proof.ts | 51 ++++ src/context/retrieval/proofing/reports.ts | 65 ++++++ src/context/retrieval/proofing/runner.ts | 213 +++++++++++++++++ src/context/retrieval/proofing/schema.ts | 118 ++++++++++ src/context/retrieval/proofing/scoring.ts | 84 +++++++ tests/retrieval-proofing.test.ts | 90 ++++++++ 20 files changed, 1089 insertions(+), 23 deletions(-) create mode 100644 benchmarks/retrieval-proofing/benchmark.v1.json create mode 100644 benchmarks/retrieval-proofing/profiles.v1.json create mode 100644 docs/retrieval-proofing-benchmark-schema.md create mode 100644 docs/retrieval-proofing.md rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/.openspec.yaml (100%) rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/design.md (100%) rename openspec/changes/{retrieval-quality-proofing => 
archive/2026-03-04-retrieval-quality-proofing}/proposal.md (100%) rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/specs/retrieval-quality-proofing/spec.md (100%) create mode 100644 openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md delete mode 100644 openspec/changes/retrieval-quality-proofing/tasks.md create mode 100644 openspec/specs/retrieval-quality-proofing/spec.md create mode 100644 src/cli/commands/retrieval-proof.ts create mode 100644 src/context/retrieval/proofing/reports.ts create mode 100644 src/context/retrieval/proofing/runner.ts create mode 100644 src/context/retrieval/proofing/schema.ts create mode 100644 src/context/retrieval/proofing/scoring.ts create mode 100644 tests/retrieval-proofing.test.ts diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml index 2ec99dc..f97e85a 100644 --- a/.github/workflows/pr-checks.yml +++ b/.github/workflows/pr-checks.yml @@ -43,6 +43,9 @@ jobs: - name: Run checks run: pnpm checks + - name: Run retrieval proofing (smoke profile) + run: pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing + - name: Comment success summary on PR if: ${{ success() && github.event_name == 'pull_request' }} continue-on-error: true diff --git a/README.md b/README.md index b643150..1da25f9 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ pnpm dev -- chat pnpm dev -- chat "summarize this repo" pnpm dev -- plan "create a rollout plan for indexing" pnpm dev -- index . +pnpm dev retrieval-proof --profile smoke pnpm dev -- automations list pnpm dev -- automations add --name "Hourly Check" --cron "0 * * * *" --prompt "summarize local status" pnpm dev -- automations run @@ -94,3 +95,4 @@ Environment variables (BYOK): - Anthropic embeddings currently fall back to deterministic local vectors. - This project intentionally uses Biome only (no ESLint/Prettier). 
+- Retrieval proofing benchmark schema/workflow docs: `docs/retrieval-proofing-benchmark-schema.md` and `docs/retrieval-proofing.md`. diff --git a/benchmarks/retrieval-proofing/benchmark.v1.json b/benchmarks/retrieval-proofing/benchmark.v1.json new file mode 100644 index 0000000..734c13b --- /dev/null +++ b/benchmarks/retrieval-proofing/benchmark.v1.json @@ -0,0 +1,217 @@ +{ + "version": "1.0", + "datasetName": "retrieval-proofing-core", + "datasetVersion": "2026.03.01", + "cases": [ + { + "id": "repo-layout", + "title": "Find retrieval implementation location", + "query": "where is hybrid retrieval implemented", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/context/retrieval/hybrid.ts", + "title": "Hybrid retrieval source", + "content": "The hybrid retrieval runner combines lexical search, vector similarity, and ranking metadata." + }, + { + "id": "d2", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank helpers", + "content": "hybridRerank computes weighted retrieval ordering and cosine similarity for vector retrieval." + }, + { + "id": "d3", + "path": "README.md", + "title": "Project overview", + "content": "General project overview and quick start commands for local development." + }, + { + "id": "d4", + "path": "src/cli/commands.ts", + "title": "CLI commands", + "content": "Registers chat, plan, index, and automations commands." + } + ] + }, + { + "id": "quality-commands", + "title": "Identify quality gates", + "query": "what command runs lint typecheck and tests", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1"], + "documents": [ + { + "id": "d1", + "path": "package.json", + "title": "Project scripts", + "content": "The checks script runs pnpm test, pnpm typecheck, pnpm lint, and pnpm build." 
+ }, + { + "id": "d2", + "path": "README.md", + "title": "README quality commands", + "content": "The README includes lint, typecheck, test, and build as quality commands." + }, + { + "id": "d3", + "path": "src/automation/runner.ts", + "title": "Automation runner", + "content": "Executes configured prompts on cron schedules." + }, + { + "id": "d4", + "path": "src/db/client.ts", + "title": "Database client", + "content": "Initializes PGLite and exposes query and exec methods." + } + ] + }, + { + "id": "provider-preflight", + "title": "Provider preflight requirements", + "query": "which env var is required for google provider preflight", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/cli/commands/chat.tsx", + "title": "Chat preflight", + "content": "Chat preflight checks provider env vars and prints setup instructions for google openai and anthropic." + }, + { + "id": "d2", + "path": "README.md", + "title": "Environment variable docs", + "content": "GOOGLE_GENERATIVE_AI_API_KEY is required when using the google provider." + }, + { + "id": "d3", + "path": "src/mcp/client.ts", + "title": "MCP client", + "content": "Starts and interacts with external MCP servers." + }, + { + "id": "d4", + "path": "src/context/indexer/full-index.ts", + "title": "Indexer", + "content": "Indexes repository files and writes chunks and embeddings." + } + ] + }, + { + "id": "policy-approval", + "title": "Approval behavior", + "query": "which actions require approval in interactive mode", + "intent": "reasoning", + "difficulty": "high", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "README.md", + "title": "Interactive approval docs", + "content": "Sensitive write and destructive tool actions require explicit approve, deny, or dismiss decisions in the TUI." 
+ }, + { + "id": "d2", + "path": "src/policy/engine.ts", + "title": "Policy engine", + "content": "Policy engine classifies tool side effects and enforces approval decisions." + }, + { + "id": "d3", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank", + "content": "Reranks retrieval candidates with weighted score combination." + }, + { + "id": "d4", + "path": "src/db/migrate.ts", + "title": "Migrations", + "content": "Applies schema migrations at startup." + } + ] + }, + { + "id": "automation-hooks", + "title": "Hook trigger behavior", + "query": "what hooks trigger tests or typecheck", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1"], + "documents": [ + { + "id": "d1", + "path": "AGENTS.md", + "title": "Hook definitions", + "content": "file-change runs pnpm test and git-head-change runs pnpm typecheck." + }, + { + "id": "d2", + "path": "README.md", + "title": "README automation", + "content": "Automations can run prompts but does not define hook command mapping." + }, + { + "id": "d3", + "path": "src/automation/scheduler.ts", + "title": "Scheduler", + "content": "Cron scheduler dispatches queued automation specs." + }, + { + "id": "d4", + "path": "src/agent/orchestrator.ts", + "title": "Orchestrator", + "content": "Runs gather reason act verify loops with validation retries." + } + ] + }, + { + "id": "ci-workflow", + "title": "CI checks pipeline", + "query": "where is pr checks workflow defined", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": ".github/workflows/pr-checks.yml", + "title": "PR checks workflow", + "content": "Runs pnpm checks in CI on pull requests." + }, + { + "id": "d2", + "path": "README.md", + "title": "Project quality commands", + "content": "The quality command list maps to CI checks execution." 
+ }, + { + "id": "d3", + "path": "src/tools/registry.ts", + "title": "Tool registry", + "content": "Defines registered local tools and validation." + }, + { + "id": "d4", + "path": "src/db/client.ts", + "title": "Database client", + "content": "Provides thin wrapper around PGLite query execution." + } + ] + } + ] +} diff --git a/benchmarks/retrieval-proofing/profiles.v1.json b/benchmarks/retrieval-proofing/profiles.v1.json new file mode 100644 index 0000000..ff5c585 --- /dev/null +++ b/benchmarks/retrieval-proofing/profiles.v1.json @@ -0,0 +1,36 @@ +{ + "version": "1.0", + "profiles": { + "smoke": { + "description": "Fast CI profile with a representative subset of benchmark cases", + "caseIds": ["repo-layout", "quality-commands", "provider-preflight"], + "thresholds": { + "hybridMinimums": { + "evidenceRelevance": 0.55, + "citationSupportCoverage": 0.75, + "compositeScore": 0.62, + "maxUnsupportedClaimPenalty": 0.45 + }, + "baselineDeltaFloors": { + "lexical": -0.03, + "vector": 0.02 + } + } + }, + "full": { + "description": "Full benchmark profile for deeper retrieval proofing", + "thresholds": { + "hybridMinimums": { + "evidenceRelevance": 0.5, + "citationSupportCoverage": 0.7, + "compositeScore": 0.58, + "maxUnsupportedClaimPenalty": 0.5 + }, + "baselineDeltaFloors": { + "lexical": -0.01, + "vector": 0.02 + } + } + } + } +} diff --git a/docs/retrieval-proofing-benchmark-schema.md b/docs/retrieval-proofing-benchmark-schema.md new file mode 100644 index 0000000..3551b00 --- /dev/null +++ b/docs/retrieval-proofing-benchmark-schema.md @@ -0,0 +1,63 @@ +# Retrieval Proofing Benchmark Schema (v1.0) + +This document defines the versioned fixture format used by retrieval proofing. 
+ +## Fixture File + +Path: `benchmarks/retrieval-proofing/benchmark.v1.json` + +Top-level shape: + +```json +{ + "version": "1.0", + "datasetName": "retrieval-proofing-core", + "datasetVersion": "2026.03.01", + "cases": [ + { + "id": "repo-layout", + "title": "Find retrieval implementation location", + "query": "where is hybrid retrieval implemented", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/context/retrieval/hybrid.ts", + "title": "Hybrid retrieval source", + "content": "..." + } + ] + } + ] +} +``` + +## Field Semantics + +- `version`: Fixture schema version. Must be `1.0` for this release. +- `datasetName`: Human-readable benchmark dataset name. +- `datasetVersion`: Version of benchmark content. Bump when case content or labels change. +- `cases`: Benchmark case list. +- `cases[].id`: Stable identifier used by profile filters and reports. +- `cases[].query`: Query string used by all retrieval strategies. +- `cases[].topK`: Number of retrieved documents considered for scoring. +- `cases[].documents`: Candidate evidence set for the case. +- `cases[].expectedEvidenceDocIds`: Canonical evidence documents used for deterministic scoring. + +## Profile File + +Path: `benchmarks/retrieval-proofing/profiles.v1.json` + +- `version`: Profile schema version (`1.0`). +- `profiles..caseIds`: Optional subset of case IDs for this profile. +- `profiles..thresholds.hybridMinimums`: Absolute floors for hybrid metrics. +- `profiles..thresholds.baselineDeltaFloors`: Minimum hybrid-vs-baseline composite deltas. + +## Versioning Rules + +- Bump `datasetVersion` whenever benchmark content changes. +- Keep schema `version` at `1.0` unless the JSON structure changes. +- Prefer adding new cases over mutating existing case IDs to preserve comparability. 
diff --git a/docs/retrieval-proofing.md b/docs/retrieval-proofing.md new file mode 100644 index 0000000..88d6c5a --- /dev/null +++ b/docs/retrieval-proofing.md @@ -0,0 +1,54 @@ +# Retrieval Quality Proofing + +Retrieval proofing evaluates `lexical`, `vector`, and `hybrid` strategies on the same benchmark dataset, emits JSON/Markdown artifacts, and enforces hybrid quality gates. + +## Run Locally + +Smoke profile: + +```bash +pnpm dev retrieval-proof --profile smoke +``` + +Full profile: + +```bash +pnpm dev retrieval-proof --profile full +``` + +Custom output directory: + +```bash +pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing +``` + +## Artifacts + +Each run writes: + +- `-.json`: per-case metrics, aggregate metrics, hybrid deltas, gate result. +- `-.md`: concise human-readable summary for PR/release notes. + +Default output path: + +- `artifacts/retrieval-proofing/` + +## CI Workflow + +PR checks run retrieval proofing with the smoke profile: + +```bash +pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing +``` + +If hybrid thresholds fail, the command exits non-zero and CI fails. + +## Updating Baseline Thresholds Safely + +1. Run the full profile locally and inspect both JSON and Markdown reports. +2. Confirm changes are intentional and linked to retrieval behavior changes. +3. Update thresholds in `benchmarks/retrieval-proofing/profiles.v1.json`. +4. Re-run both `smoke` and `full` profiles and ensure results are stable. +5. Include rationale for threshold changes in PR description (what changed and why). + +Avoid lowering thresholds to mask regressions. Prefer improving retrieval behavior first. 
diff --git a/openspec/changes/retrieval-quality-proofing/.openspec.yaml b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/.openspec.yaml similarity index 100% rename from openspec/changes/retrieval-quality-proofing/.openspec.yaml rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/.openspec.yaml diff --git a/openspec/changes/retrieval-quality-proofing/design.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/design.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/design.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/design.md diff --git a/openspec/changes/retrieval-quality-proofing/proposal.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/proposal.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/proposal.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/proposal.md diff --git a/openspec/changes/retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md diff --git a/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md new file mode 100644 index 0000000..b914fd6 --- /dev/null +++ b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md @@ -0,0 +1,23 @@ +## 1. 
Benchmark Dataset and Configuration + +- [x] 1.1 Define and document versioned benchmark fixture schema for retrieval proofing cases +- [x] 1.2 Add initial benchmark dataset covering multiple query intents and grounding difficulty levels +- [x] 1.3 Implement profile-based proofing configuration (for example `smoke` and `full`) with threshold settings + +## 2. Evaluation Runner and Scoring + +- [x] 2.1 Implement retrieval proofing runner that executes lexical, vector, and hybrid modes over the same case set +- [x] 2.2 Implement deterministic grounding metric scoring for evidence relevance, citation support, and unsupported-claim penalty +- [x] 2.3 Add aggregate scoring and strategy delta computation suitable for pass/fail gating + +## 3. Reporting and CLI Integration + +- [x] 3.1 Add CLI command(s) to run retrieval proofing for a selected benchmark profile +- [x] 3.2 Generate JSON report artifacts with per-case and aggregate metrics for each strategy +- [x] 3.3 Generate Markdown summary report highlighting hybrid-vs-baseline outcomes and gate status + +## 4. Quality Gates and Verification + +- [x] 4.1 Integrate proofing command into CI with non-zero exit on failed hybrid thresholds +- [x] 4.2 Add tests for scoring determinism, report schema stability, and gate pass/fail behavior +- [x] 4.3 Document local and CI proofing workflows, including how to update baseline thresholds safely diff --git a/openspec/changes/retrieval-quality-proofing/tasks.md b/openspec/changes/retrieval-quality-proofing/tasks.md deleted file mode 100644 index 0fe1789..0000000 --- a/openspec/changes/retrieval-quality-proofing/tasks.md +++ /dev/null @@ -1,23 +0,0 @@ -## 1. 
Benchmark Dataset and Configuration - -- [ ] 1.1 Define and document versioned benchmark fixture schema for retrieval proofing cases -- [ ] 1.2 Add initial benchmark dataset covering multiple query intents and grounding difficulty levels -- [ ] 1.3 Implement profile-based proofing configuration (for example `smoke` and `full`) with threshold settings - -## 2. Evaluation Runner and Scoring - -- [ ] 2.1 Implement retrieval proofing runner that executes lexical, vector, and hybrid modes over the same case set -- [ ] 2.2 Implement deterministic grounding metric scoring for evidence relevance, citation support, and unsupported-claim penalty -- [ ] 2.3 Add aggregate scoring and strategy delta computation suitable for pass/fail gating - -## 3. Reporting and CLI Integration - -- [ ] 3.1 Add CLI command(s) to run retrieval proofing for a selected benchmark profile -- [ ] 3.2 Generate JSON report artifacts with per-case and aggregate metrics for each strategy -- [ ] 3.3 Generate Markdown summary report highlighting hybrid-vs-baseline outcomes and gate status - -## 4. Quality Gates and Verification - -- [ ] 4.1 Integrate proofing command into CI with non-zero exit on failed hybrid thresholds -- [ ] 4.2 Add tests for scoring determinism, report schema stability, and gate pass/fail behavior -- [ ] 4.3 Document local and CI proofing workflows, including how to update baseline thresholds safely diff --git a/openspec/specs/retrieval-quality-proofing/spec.md b/openspec/specs/retrieval-quality-proofing/spec.md new file mode 100644 index 0000000..ef4b8ec --- /dev/null +++ b/openspec/specs/retrieval-quality-proofing/spec.md @@ -0,0 +1,37 @@ +# retrieval-quality-proofing Specification + +## Purpose +TBD - created by archiving change retrieval-quality-proofing. Update Purpose after archive. 
+## Requirements +### Requirement: Multi-Strategy Retrieval Evaluation +The system MUST execute the same benchmark question set against at least three retrieval strategies: lexical-only, vector-only, and hybrid. + +#### Scenario: Compare strategies on shared benchmark +- **WHEN** a proofing run starts for a benchmark profile +- **THEN** the system runs every benchmark case across lexical, vector, and hybrid modes using identical inputs and scoring configuration + +### Requirement: Deterministic Grounding Metrics +The system MUST calculate deterministic grounding metrics for each benchmark case and strategy, including evidence relevance, citation support coverage, and unsupported-claim penalty. + +#### Scenario: Produce deterministic scores +- **WHEN** the same benchmark profile and repository state are evaluated multiple times +- **THEN** the computed grounding metrics and aggregate scores are identical across runs except for explicitly declared non-deterministic fields + +### Requirement: Versioned Benchmark and Report Artifacts +The system MUST support versioned benchmark fixtures and emit both machine-readable and human-readable report artifacts for every proofing run. + +#### Scenario: Generate proof artifacts +- **WHEN** a proofing run completes +- **THEN** the system writes a JSON report containing per-case and aggregate metric values and writes a Markdown summary highlighting strategy deltas and pass/fail gate status + +### Requirement: Hybrid Quality Gate Enforcement +The system MUST enforce configurable quality gates that verify hybrid retrieval outperforms configured baseline strategies on grounding metrics. 
+ +#### Scenario: Gate fails on hybrid regression +- **WHEN** a proofing run determines that hybrid retrieval does not meet configured improvement thresholds versus baseline +- **THEN** the command exits non-zero and marks the run as failed for CI enforcement + +#### Scenario: Gate passes on acceptable hybrid improvement +- **WHEN** a proofing run determines that hybrid retrieval meets configured improvement thresholds versus baseline +- **THEN** the command exits zero and marks the run as passing + diff --git a/src/cli/commands.ts b/src/cli/commands.ts index 683716f..00564bc 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -7,6 +7,7 @@ import { import { runChatCommand } from './commands/chat'; import { runIndexCommand } from './commands/index'; import { runPlanCommand } from './commands/plan'; +import { runRetrievalProofCommand } from './commands/retrieval-proof'; export function createProgram(): Command { const program = new Command(); @@ -37,6 +38,38 @@ export function createProgram(): Command { await runIndexCommand(repoRoot); }); + program + .command('retrieval-proof') + .description('Run retrieval quality proofing against benchmark profiles') + .option( + '--benchmark ', + 'benchmark fixture JSON path', + 'benchmarks/retrieval-proofing/benchmark.v1.json' + ) + .option( + '--profiles ', + 'benchmark profiles JSON path', + 'benchmarks/retrieval-proofing/profiles.v1.json' + ) + .option('--profile ', 'benchmark profile name', 'smoke') + .option( + '--output-dir ', + 'directory for generated reports', + 'artifacts/retrieval-proofing' + ) + .option('--no-fail-on-gate', 'do not exit non-zero when gate fails') + .action( + async (options: { + benchmark: string; + profiles: string; + profile: string; + outputDir: string; + failOnGate: boolean; + }) => { + await runRetrievalProofCommand(options); + } + ); + const automations = program.command('automations').description('Manage local automations'); automations.command('list').action(async () => { diff --git 
a/src/cli/commands/retrieval-proof.ts b/src/cli/commands/retrieval-proof.ts new file mode 100644 index 0000000..905a2c1 --- /dev/null +++ b/src/cli/commands/retrieval-proof.ts @@ -0,0 +1,51 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import { isAbsolute, join, resolve } from 'node:path'; +import { formatProofingMarkdown } from '../../context/retrieval/proofing/reports'; +import { runRetrievalProofing } from '../../context/retrieval/proofing/runner'; + +type RetrievalProofCommandOptions = { + benchmark: string; + profiles: string; + profile: string; + outputDir: string; + failOnGate: boolean; +}; + +export async function runRetrievalProofCommand( + options: RetrievalProofCommandOptions +): Promise { + const benchmarkPath = absoluteFromCwd(options.benchmark); + const profilesPath = absoluteFromCwd(options.profiles); + const outputDir = absoluteFromCwd(options.outputDir); + + const report = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: options.profile, + }); + + await mkdir(outputDir, { recursive: true }); + const timestamp = report.generatedAt.replaceAll(':', '-'); + const baseName = `${report.profile}-${timestamp}`; + const jsonPath = join(outputDir, `${baseName}.json`); + const markdownPath = join(outputDir, `${baseName}.md`); + + await writeFile(jsonPath, `${JSON.stringify(report, null, 2)}\n`, 'utf8'); + await writeFile(markdownPath, `${formatProofingMarkdown(report)}\n`, 'utf8'); + + console.log(`Retrieval proofing complete for profile "${report.profile}".`); + console.log(`Gate status: ${report.gate.passed ? 
'PASS' : 'FAIL'}`); + console.log(`JSON report: ${jsonPath}`); + console.log(`Markdown report: ${markdownPath}`); + + if (!report.gate.passed && options.failOnGate) { + throw new Error(`Retrieval proofing gate failed: ${report.gate.failures.join('; ')}`); + } +} + +function absoluteFromCwd(path: string): string { + if (isAbsolute(path)) { + return path; + } + return resolve(process.cwd(), path); +} diff --git a/src/context/retrieval/proofing/reports.ts b/src/context/retrieval/proofing/reports.ts new file mode 100644 index 0000000..0067ec2 --- /dev/null +++ b/src/context/retrieval/proofing/reports.ts @@ -0,0 +1,65 @@ +import type { RetrievalProofingReport } from './schema'; + +export function formatProofingMarkdown(report: RetrievalProofingReport): string { + const hybridAggregate = report.strategies.hybrid.aggregate.metrics; + const lexicalAggregate = report.strategies.lexical.aggregate.metrics; + const vectorAggregate = report.strategies.vector.aggregate.metrics; + + const lines = [ + '# Retrieval Quality Proofing Report', + '', + `- Generated: ${report.generatedAt}`, + `- Benchmark: ${report.benchmark.datasetName}@${report.benchmark.datasetVersion}`, + `- Profile: ${report.profile}`, + `- Gate: ${report.gate.passed ? 
'PASS' : 'FAIL'}`, + '', + '## Aggregate Metrics', + '', + '| Strategy | Evidence Relevance | Citation Coverage | Unsupported Penalty | Composite |', + '| --- | ---: | ---: | ---: | ---: |', + renderAggregateRow('hybrid', hybridAggregate), + renderAggregateRow('lexical', lexicalAggregate), + renderAggregateRow('vector', vectorAggregate), + '', + '## Hybrid Deltas vs Baselines', + '', + '| Baseline | Evidence Relevance Δ | Citation Coverage Δ | Unsupported Penalty Δ | Composite Δ |', + '| --- | ---: | ---: | ---: | ---: |', + ...report.hybridDeltas.map((entry) => + [ + `| ${entry.baseline}`, + `${entry.metricDeltas.evidenceRelevance.toFixed(3)}`, + `${entry.metricDeltas.citationSupportCoverage.toFixed(3)}`, + `${entry.metricDeltas.unsupportedClaimPenalty.toFixed(3)}`, + `${entry.metricDeltas.compositeScore.toFixed(3)} |`, + ].join(' | ') + ), + '', + '## Gate Status', + '', + report.gate.passed ? '- All configured thresholds passed.' : '- Failure reasons:', + ...report.gate.failures.map((failure) => ` - ${failure}`), + '', + '## Per-Case Hybrid Summary', + '', + '| Case | Retrieved Doc IDs | Expected Evidence IDs | Composite |', + '| --- | --- | --- | ---: |', + ...report.strategies.hybrid.cases.map((entry) => + [ + `| ${entry.caseId}`, + entry.retrievedDocIds.join(', '), + entry.expectedEvidenceDocIds.join(', '), + `${entry.metrics.compositeScore.toFixed(3)} |`, + ].join(' | ') + ), + ]; + + return lines.join('\n'); +} + +function renderAggregateRow( + strategy: string, + metrics: RetrievalProofingReport['strategies']['hybrid']['aggregate']['metrics'] +): string { + return `| ${strategy} | ${metrics.evidenceRelevance.toFixed(3)} | ${metrics.citationSupportCoverage.toFixed(3)} | ${metrics.unsupportedClaimPenalty.toFixed(3)} | ${metrics.compositeScore.toFixed(3)} |`; +} diff --git a/src/context/retrieval/proofing/runner.ts b/src/context/retrieval/proofing/runner.ts new file mode 100644 index 0000000..00be525 --- /dev/null +++ 
b/src/context/retrieval/proofing/runner.ts @@ -0,0 +1,213 @@ +import { readFile } from 'node:fs/promises'; +import { cosineSimilarity, deterministicEmbedding } from '../rerank'; +import { + type BenchmarkCase, + type BenchmarkProfile, + BenchmarkProfilesSchema, + RetrievalBenchmarkSchema, + type RetrievalProofingReport, + RetrievalProofingReportSchema, + type RetrievalProofingStrategy, +} from './schema'; +import { averageCaseMetrics, scoreCaseMetrics, subtractMetrics } from './scoring'; + +const STRATEGIES: RetrievalProofingStrategy[] = ['lexical', 'vector', 'hybrid']; + +export async function runRetrievalProofing(input: { + benchmarkPath: string; + profilesPath: string; + profileName: string; +}): Promise { + const benchmark = await loadBenchmark(input.benchmarkPath); + const profiles = await loadProfiles(input.profilesPath); + const profile = profiles.profiles[input.profileName]; + + if (!profile) { + throw new Error( + `Unknown benchmark profile "${input.profileName}". Available: ${Object.keys(profiles.profiles).join(', ')}` + ); + } + + const selectedCases = selectCases(benchmark.cases, profile); + const strategyReports = Object.fromEntries( + STRATEGIES.map((strategy) => { + const cases = selectedCases.map((benchmarkCase) => { + const retrievedDocIds = retrieveDocsForCase(benchmarkCase, strategy); + const metrics = scoreCaseMetrics({ benchmarkCase, retrievedDocIds }); + return { + caseId: benchmarkCase.id, + strategy, + retrievedDocIds, + expectedEvidenceDocIds: benchmarkCase.expectedEvidenceDocIds, + metrics, + }; + }); + const aggregate = { + strategy, + metrics: averageCaseMetrics(cases.map((entry) => entry.metrics)), + }; + return [strategy, { cases, aggregate }]; + }) + ) as RetrievalProofingReport['strategies']; + + const hybridAggregate = strategyReports.hybrid.aggregate.metrics; + const lexicalAggregate = strategyReports.lexical.aggregate.metrics; + const vectorAggregate = strategyReports.vector.aggregate.metrics; + + const hybridDeltas = [ + { + 
baseline: 'lexical' as const, + metricDeltas: subtractMetrics(hybridAggregate, lexicalAggregate), + }, + { + baseline: 'vector' as const, + metricDeltas: subtractMetrics(hybridAggregate, vectorAggregate), + }, + ]; + + const gateFailures = evaluateGate({ + profile, + hybridAggregate, + lexicalAggregate, + vectorAggregate, + }); + + return RetrievalProofingReportSchema.parse({ + schemaVersion: '1.0', + benchmark: { + datasetName: benchmark.datasetName, + datasetVersion: benchmark.datasetVersion, + }, + profile: input.profileName, + generatedAt: new Date().toISOString(), + strategies: strategyReports, + hybridDeltas, + gate: { + passed: gateFailures.length === 0, + failures: gateFailures, + }, + }); +} + +function evaluateGate(input: { + profile: BenchmarkProfile; + hybridAggregate: RetrievalProofingReport['strategies']['hybrid']['aggregate']['metrics']; + lexicalAggregate: RetrievalProofingReport['strategies']['lexical']['aggregate']['metrics']; + vectorAggregate: RetrievalProofingReport['strategies']['vector']['aggregate']['metrics']; +}): string[] { + const failures: string[] = []; + const minimums = input.profile.thresholds.hybridMinimums; + const deltas = input.profile.thresholds.baselineDeltaFloors; + const hybrid = input.hybridAggregate; + + if (hybrid.evidenceRelevance < minimums.evidenceRelevance) { + failures.push( + `hybrid evidenceRelevance ${hybrid.evidenceRelevance.toFixed(3)} < ${minimums.evidenceRelevance.toFixed(3)}` + ); + } + if (hybrid.citationSupportCoverage < minimums.citationSupportCoverage) { + failures.push( + `hybrid citationSupportCoverage ${hybrid.citationSupportCoverage.toFixed(3)} < ${minimums.citationSupportCoverage.toFixed(3)}` + ); + } + if (hybrid.compositeScore < minimums.compositeScore) { + failures.push( + `hybrid compositeScore ${hybrid.compositeScore.toFixed(3)} < ${minimums.compositeScore.toFixed(3)}` + ); + } + if (hybrid.unsupportedClaimPenalty > minimums.maxUnsupportedClaimPenalty) { + failures.push( + `hybrid 
unsupportedClaimPenalty ${hybrid.unsupportedClaimPenalty.toFixed(3)} > ${minimums.maxUnsupportedClaimPenalty.toFixed(3)}` + ); + } + + const hybridVsLexical = hybrid.compositeScore - input.lexicalAggregate.compositeScore; + if (hybridVsLexical < deltas.lexical) { + failures.push( + `hybrid-vs-lexical composite delta ${hybridVsLexical.toFixed(3)} < ${deltas.lexical.toFixed(3)}` + ); + } + + const hybridVsVector = hybrid.compositeScore - input.vectorAggregate.compositeScore; + if (hybridVsVector < deltas.vector) { + failures.push( + `hybrid-vs-vector composite delta ${hybridVsVector.toFixed(3)} < ${deltas.vector.toFixed(3)}` + ); + } + + return failures; +} + +function selectCases(cases: BenchmarkCase[], profile: BenchmarkProfile): BenchmarkCase[] { + if (!profile.caseIds || profile.caseIds.length === 0) { + return cases; + } + + const wanted = new Set(profile.caseIds); + const selected = cases.filter((entry) => wanted.has(entry.id)); + if (selected.length !== profile.caseIds.length) { + const selectedIds = new Set(selected.map((entry) => entry.id)); + const missing = profile.caseIds.filter((id) => !selectedIds.has(id)); + throw new Error(`Profile references missing benchmark case IDs: ${missing.join(', ')}`); + } + return selected; +} + +function retrieveDocsForCase( + benchmarkCase: BenchmarkCase, + strategy: RetrievalProofingStrategy +): string[] { + const rows = benchmarkCase.documents.map((doc) => { + const lexicalScore = computeLexicalScore(benchmarkCase.query, `${doc.title} ${doc.content}`); + const vectorScore = cosineSimilarity( + deterministicEmbedding(benchmarkCase.query), + deterministicEmbedding(`${doc.title} ${doc.content}`) + ); + const totalScore = + strategy === 'lexical' + ? lexicalScore + : strategy === 'vector' + ? 
vectorScore + : lexicalScore * 0.7 + vectorScore * 0.3; + + return { + id: doc.id, + totalScore, + }; + }); + + return rows + .sort((a, b) => b.totalScore - a.totalScore) + .slice(0, benchmarkCase.topK) + .map((entry) => entry.id); +} + +function computeLexicalScore(query: string, haystack: string): number { + const tokens = query + .toLowerCase() + .split(/[^a-z0-9]+/g) + .filter(Boolean); + if (tokens.length === 0) { + return 0; + } + + const source = haystack.toLowerCase(); + let score = 0; + for (const token of tokens) { + if (source.includes(token)) { + score += 1; + } + } + + return score / tokens.length; +} + +async function loadBenchmark(path: string) { + const raw = await readFile(path, 'utf8'); + return RetrievalBenchmarkSchema.parse(JSON.parse(raw)); +} + +async function loadProfiles(path: string) { + const raw = await readFile(path, 'utf8'); + return BenchmarkProfilesSchema.parse(JSON.parse(raw)); +} diff --git a/src/context/retrieval/proofing/schema.ts b/src/context/retrieval/proofing/schema.ts new file mode 100644 index 0000000..0e7a78c --- /dev/null +++ b/src/context/retrieval/proofing/schema.ts @@ -0,0 +1,118 @@ +import { z } from 'zod'; + +export const RetrievalProofingStrategySchema = z.enum(['lexical', 'vector', 'hybrid']); +export type RetrievalProofingStrategy = z.infer<typeof RetrievalProofingStrategySchema>; + +export const BenchmarkDocumentSchema = z.object({ + id: z.string().min(1), + path: z.string().min(1), + title: z.string().min(1), + content: z.string().min(1), +}); + +export const BenchmarkCaseSchema = z.object({ + id: z.string().min(1), + title: z.string().min(1), + query: z.string().min(1), + intent: z.string().min(1), + difficulty: z.enum(['low', 'medium', 'high']), + topK: z.number().int().positive().default(3), + documents: z.array(BenchmarkDocumentSchema).min(2), + expectedEvidenceDocIds: z.array(z.string().min(1)).min(1), +}); +export type BenchmarkCase = z.infer<typeof BenchmarkCaseSchema>; + +export const RetrievalBenchmarkSchema = z.object({ + version: z.literal('1.0'), + datasetName:
z.string().min(1), + datasetVersion: z.string().min(1), + cases: z.array(BenchmarkCaseSchema).min(1), +}); +export type RetrievalBenchmark = z.infer<typeof RetrievalBenchmarkSchema>; + +export const ProofingThresholdsSchema = z.object({ + hybridMinimums: z.object({ + evidenceRelevance: z.number().min(0).max(1), + citationSupportCoverage: z.number().min(0).max(1), + compositeScore: z.number().min(0).max(1), + maxUnsupportedClaimPenalty: z.number().min(0).max(1), + }), + baselineDeltaFloors: z.object({ + lexical: z.number(), + vector: z.number(), + }), +}); +export type ProofingThresholds = z.infer<typeof ProofingThresholdsSchema>; + +export const BenchmarkProfileSchema = z.object({ + description: z.string().min(1), + caseIds: z.array(z.string().min(1)).optional(), + thresholds: ProofingThresholdsSchema, +}); +export type BenchmarkProfile = z.infer<typeof BenchmarkProfileSchema>; + +export const BenchmarkProfilesSchema = z.object({ + version: z.literal('1.0'), + profiles: z.record(z.string(), BenchmarkProfileSchema), +}); +export type BenchmarkProfiles = z.infer<typeof BenchmarkProfilesSchema>; + +export const CaseMetricsSchema = z.object({ + evidenceRelevance: z.number().min(0).max(1), + citationSupportCoverage: z.number().min(0).max(1), + unsupportedClaimPenalty: z.number().min(0).max(1), + compositeScore: z.number().min(0).max(1), +}); +export type CaseMetrics = z.infer<typeof CaseMetricsSchema>; + +export const StrategyCaseResultSchema = z.object({ + caseId: z.string(), + strategy: RetrievalProofingStrategySchema, + retrievedDocIds: z.array(z.string()), + expectedEvidenceDocIds: z.array(z.string()), + metrics: CaseMetricsSchema, +}); +export type StrategyCaseResult = z.infer<typeof StrategyCaseResultSchema>; + +export const StrategyAggregateSchema = z.object({ + strategy: RetrievalProofingStrategySchema, + metrics: CaseMetricsSchema, +}); +export type StrategyAggregate = z.infer<typeof StrategyAggregateSchema>; + +export const StrategyDeltaSchema = z.object({ + baseline: z.enum(['lexical', 'vector']), + metricDeltas: z.object({ + evidenceRelevance: z.number(), + citationSupportCoverage: z.number(), + unsupportedClaimPenalty: z.number(), + compositeScore: z.number(), + }), +});
+export type StrategyDelta = z.infer<typeof StrategyDeltaSchema>; + +export const GateResultSchema = z.object({ + passed: z.boolean(), + failures: z.array(z.string()), +}); +export type GateResult = z.infer<typeof GateResultSchema>; + +export const RetrievalProofingReportSchema = z.object({ + schemaVersion: z.literal('1.0'), + benchmark: z.object({ + datasetName: z.string(), + datasetVersion: z.string(), + }), + profile: z.string(), + generatedAt: z.string(), + strategies: z.record( + RetrievalProofingStrategySchema, + z.object({ + cases: z.array(StrategyCaseResultSchema), + aggregate: StrategyAggregateSchema, + }) + ), + hybridDeltas: z.array(StrategyDeltaSchema), + gate: GateResultSchema, +}); +export type RetrievalProofingReport = z.infer<typeof RetrievalProofingReportSchema>; diff --git a/src/context/retrieval/proofing/scoring.ts b/src/context/retrieval/proofing/scoring.ts new file mode 100644 index 0000000..790a032 --- /dev/null +++ b/src/context/retrieval/proofing/scoring.ts @@ -0,0 +1,84 @@ +import type { BenchmarkCase, CaseMetrics } from './schema'; + +export function scoreCaseMetrics(input: { + benchmarkCase: BenchmarkCase; + retrievedDocIds: string[]; +}): CaseMetrics { + const { benchmarkCase, retrievedDocIds } = input; + const topK = benchmarkCase.topK; + const expectedSet = new Set(benchmarkCase.expectedEvidenceDocIds); + const top = retrievedDocIds.slice(0, topK); + const weightedHits = top.reduce((acc, docId, index) => { + if (!expectedSet.has(docId)) { + return acc; + } + return acc + (topK - index) / topK; + }, 0); + const weightDenominator = top.reduce((acc, _docId, index) => acc + (topK - index) / topK, 0); + const hits = top.filter((docId) => expectedSet.has(docId)).length; + + const evidenceRelevance = divide(weightedHits, weightDenominator); + const citationSupportCoverage = divide(hits, benchmarkCase.expectedEvidenceDocIds.length); + const unsupportedClaimPenalty = divide(topK - hits, topK); + const compositeScore = clamp01( + evidenceRelevance * 0.45 + citationSupportCoverage * 0.45 + (1 - unsupportedClaimPenalty) * 0.1 + ); + 
return { + evidenceRelevance, + citationSupportCoverage, + unsupportedClaimPenalty, + compositeScore, + }; +} + +export function averageCaseMetrics(metrics: CaseMetrics[]): CaseMetrics { + if (metrics.length === 0) { + return { + evidenceRelevance: 0, + citationSupportCoverage: 0, + unsupportedClaimPenalty: 1, + compositeScore: 0, + }; + } + + return { + evidenceRelevance: average(metrics.map((metric) => metric.evidenceRelevance)), + citationSupportCoverage: average(metrics.map((metric) => metric.citationSupportCoverage)), + unsupportedClaimPenalty: average(metrics.map((metric) => metric.unsupportedClaimPenalty)), + compositeScore: average(metrics.map((metric) => metric.compositeScore)), + }; +} + +export function subtractMetrics(a: CaseMetrics, b: CaseMetrics): CaseMetrics { + return { + evidenceRelevance: a.evidenceRelevance - b.evidenceRelevance, + citationSupportCoverage: a.citationSupportCoverage - b.citationSupportCoverage, + unsupportedClaimPenalty: a.unsupportedClaimPenalty - b.unsupportedClaimPenalty, + compositeScore: a.compositeScore - b.compositeScore, + }; +} + +function average(values: number[]): number { + if (values.length === 0) { + return 0; + } + return values.reduce((acc, value) => acc + value, 0) / values.length; +} + +function divide(num: number, den: number): number { + if (den <= 0) { + return 0; + } + return num / den; +} + +function clamp01(value: number): number { + if (value <= 0) { + return 0; + } + if (value >= 1) { + return 1; + } + return value; +} diff --git a/tests/retrieval-proofing.test.ts b/tests/retrieval-proofing.test.ts new file mode 100644 index 0000000..66333fb --- /dev/null +++ b/tests/retrieval-proofing.test.ts @@ -0,0 +1,90 @@ +import { mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join, resolve } from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { runRetrievalProofing } from '../src/context/retrieval/proofing/runner'; +import { 
RetrievalProofingReportSchema } from '../src/context/retrieval/proofing/schema'; + +const benchmarkPath = resolve(process.cwd(), 'benchmarks/retrieval-proofing/benchmark.v1.json'); +const profilesPath = resolve(process.cwd(), 'benchmarks/retrieval-proofing/profiles.v1.json'); + +describe('retrieval proofing', () => { + it('produces deterministic scoring for fixed benchmark/profile inputs', async () => { + const first = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + const second = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + + expect(first.generatedAt).not.toEqual(second.generatedAt); + expect({ ...first, generatedAt: 'fixed' }).toEqual({ ...second, generatedAt: 'fixed' }); + }); + + it('keeps JSON report schema stable and parseable', async () => { + const report = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + + const parsed = RetrievalProofingReportSchema.parse(report); + + expect(parsed.schemaVersion).toBe('1.0'); + expect(Object.keys(parsed.strategies)).toEqual(['lexical', 'vector', 'hybrid']); + expect(parsed.strategies.hybrid.cases.length).toBeGreaterThan(0); + expect(typeof parsed.gate.passed).toBe('boolean'); + }); + + it('reports gate pass/fail based on configured thresholds', async () => { + const passReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + expect(passReport.gate.passed).toBe(true); + + const tempDir = await mkdtemp(join(tmpdir(), 'retrieval-proofing-')); + const strictProfilesPath = join(tempDir, 'profiles.strict.json'); + const baseProfiles = JSON.parse(await readFile(profilesPath, 'utf8')) as { + version: string; + profiles: Record; + }; + + const strictProfiles = { + ...baseProfiles, + profiles: { + ...baseProfiles.profiles, + smoke: { + description: 'strict gate for failure test', + thresholds: { + hybridMinimums: { + evidenceRelevance: 0.99, + 
citationSupportCoverage: 0.99, + compositeScore: 0.99, + maxUnsupportedClaimPenalty: 0.01, + }, + baselineDeltaFloors: { + lexical: 0.2, + vector: 0.2, + }, + }, + }, + }, + }; + await writeFile(strictProfilesPath, `${JSON.stringify(strictProfiles, null, 2)}\n`, 'utf8'); + + const failReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath: strictProfilesPath, + profileName: 'smoke', + }); + + expect(failReport.gate.passed).toBe(false); + expect(failReport.gate.failures.length).toBeGreaterThan(0); + }); +}); From dd34f45415d1b2f7055fb29e8400bc1b9f519ffa Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 19:13:08 -0800 Subject: [PATCH 2/2] fix(retrieval): address PR feedback on docs and test stability --- docs/retrieval-proofing-benchmark-schema.md | 6 ++ .../specs/retrieval-quality-proofing/spec.md | 5 +- tests/retrieval-proofing.test.ts | 63 ++++++++++--------- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/docs/retrieval-proofing-benchmark-schema.md b/docs/retrieval-proofing-benchmark-schema.md index 3551b00..4f30f2a 100644 --- a/docs/retrieval-proofing-benchmark-schema.md +++ b/docs/retrieval-proofing-benchmark-schema.md @@ -28,6 +28,12 @@ Top-level shape: "path": "src/context/retrieval/hybrid.ts", "title": "Hybrid retrieval source", "content": "..." + }, + { + "id": "d2", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank helpers", + "content": "..." } ] } diff --git a/openspec/specs/retrieval-quality-proofing/spec.md b/openspec/specs/retrieval-quality-proofing/spec.md index ef4b8ec..4c5794e 100644 --- a/openspec/specs/retrieval-quality-proofing/spec.md +++ b/openspec/specs/retrieval-quality-proofing/spec.md @@ -1,7 +1,9 @@ # retrieval-quality-proofing Specification ## Purpose -TBD - created by archiving change retrieval-quality-proofing. Update Purpose after archive. 
+Define how the system evaluates, compares, and enforces retrieval quality across lexical, vector, +and hybrid strategies using deterministic benchmarks, grounding-focused metrics, and +CI-enforceable quality gates. ## Requirements ### Requirement: Multi-Strategy Retrieval Evaluation The system MUST execute the same benchmark question set against at least three retrieval strategies: lexical-only, vector-only, and hybrid. @@ -34,4 +36,3 @@ The system MUST enforce configurable quality gates that verify hybrid retrieval #### Scenario: Gate passes on acceptable hybrid improvement - **WHEN** a proofing run determines that hybrid retrieval meets configured improvement thresholds versus baseline - **THEN** the command exits zero and marks the run as passing - diff --git a/tests/retrieval-proofing.test.ts b/tests/retrieval-proofing.test.ts index 66333fb..a0bca43 100644 --- a/tests/retrieval-proofing.test.ts +++ b/tests/retrieval-proofing.test.ts @@ -1,4 +1,4 @@ -import { mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join, resolve } from 'node:path'; import { describe, expect, it } from 'vitest'; @@ -21,7 +21,6 @@ describe('retrieval proofing', () => { profileName: 'smoke', }); - expect(first.generatedAt).not.toEqual(second.generatedAt); expect({ ...first, generatedAt: 'fixed' }).toEqual({ ...second, generatedAt: 'fixed' }); }); @@ -52,39 +51,45 @@ describe('retrieval proofing', () => { const strictProfilesPath = join(tempDir, 'profiles.strict.json'); const baseProfiles = JSON.parse(await readFile(profilesPath, 'utf8')) as { version: string; - profiles: Record; + profiles: Record; }; - const strictProfiles = { - ...baseProfiles, - profiles: { - ...baseProfiles.profiles, - smoke: { - description: 'strict gate for failure test', - thresholds: { - hybridMinimums: { - evidenceRelevance: 0.99, - citationSupportCoverage: 0.99, - compositeScore: 0.99, - 
maxUnsupportedClaimPenalty: 0.01, - }, - baselineDeltaFloors: { - lexical: 0.2, - vector: 0.2, + try { + const existingSmoke = baseProfiles.profiles.smoke ?? {}; + const strictProfiles = { + ...baseProfiles, + profiles: { + ...baseProfiles.profiles, + smoke: { + ...existingSmoke, + description: 'strict gate for failure test', + thresholds: { + hybridMinimums: { + evidenceRelevance: 0.99, + citationSupportCoverage: 0.99, + compositeScore: 0.99, + maxUnsupportedClaimPenalty: 0.01, + }, + baselineDeltaFloors: { + lexical: 0.2, + vector: 0.2, + }, }, }, }, - }, - }; - await writeFile(strictProfilesPath, `${JSON.stringify(strictProfiles, null, 2)}\n`, 'utf8'); + }; + await writeFile(strictProfilesPath, `${JSON.stringify(strictProfiles, null, 2)}\n`, 'utf8'); - const failReport = await runRetrievalProofing({ - benchmarkPath, - profilesPath: strictProfilesPath, - profileName: 'smoke', - }); + const failReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath: strictProfilesPath, + profileName: 'smoke', + }); - expect(failReport.gate.passed).toBe(false); - expect(failReport.gate.failures.length).toBeGreaterThan(0); + expect(failReport.gate.passed).toBe(false); + expect(failReport.gate.failures.length).toBeGreaterThan(0); + } finally { + await rm(tempDir, { recursive: true, force: true }); + } }); });