From fa7846ec65de481fff63510a531c11322710a31f Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 17:15:37 -0800 Subject: [PATCH 1/3] feat(retrieval): add retrieval quality proofing workflow and archive change - add retrieval proofing benchmark fixtures and profile thresholds - add proofing runner, deterministic scoring, and report generation - add retrieval-proof CLI command and smoke CI gate - add tests and docs for local/CI proofing workflow - archive retrieval-quality-proofing change and sync main spec --- .github/workflows/pr-checks.yml | 3 + README.md | 2 + .../retrieval-proofing/benchmark.v1.json | 217 ++++++++++++++++++ .../retrieval-proofing/profiles.v1.json | 36 +++ docs/retrieval-proofing-benchmark-schema.md | 63 +++++ docs/retrieval-proofing.md | 54 +++++ .../.openspec.yaml | 0 .../design.md | 0 .../proposal.md | 0 .../specs/retrieval-quality-proofing/spec.md | 0 .../tasks.md | 23 ++ .../retrieval-quality-proofing/tasks.md | 23 -- .../specs/retrieval-quality-proofing/spec.md | 37 +++ src/cli/commands.ts | 33 +++ src/cli/commands/retrieval-proof.ts | 51 ++++ src/context/retrieval/proofing/reports.ts | 65 ++++++ src/context/retrieval/proofing/runner.ts | 213 +++++++++++++++++ src/context/retrieval/proofing/schema.ts | 118 ++++++++++ src/context/retrieval/proofing/scoring.ts | 84 +++++++ tests/retrieval-proofing.test.ts | 90 ++++++++ 20 files changed, 1089 insertions(+), 23 deletions(-) create mode 100644 benchmarks/retrieval-proofing/benchmark.v1.json create mode 100644 benchmarks/retrieval-proofing/profiles.v1.json create mode 100644 docs/retrieval-proofing-benchmark-schema.md create mode 100644 docs/retrieval-proofing.md rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/.openspec.yaml (100%) rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/design.md (100%) rename openspec/changes/{retrieval-quality-proofing => 
archive/2026-03-04-retrieval-quality-proofing}/proposal.md (100%) rename openspec/changes/{retrieval-quality-proofing => archive/2026-03-04-retrieval-quality-proofing}/specs/retrieval-quality-proofing/spec.md (100%) create mode 100644 openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md delete mode 100644 openspec/changes/retrieval-quality-proofing/tasks.md create mode 100644 openspec/specs/retrieval-quality-proofing/spec.md create mode 100644 src/cli/commands/retrieval-proof.ts create mode 100644 src/context/retrieval/proofing/reports.ts create mode 100644 src/context/retrieval/proofing/runner.ts create mode 100644 src/context/retrieval/proofing/schema.ts create mode 100644 src/context/retrieval/proofing/scoring.ts create mode 100644 tests/retrieval-proofing.test.ts diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml index 2ec99dc..f97e85a 100644 --- a/.github/workflows/pr-checks.yml +++ b/.github/workflows/pr-checks.yml @@ -43,6 +43,9 @@ jobs: - name: Run checks run: pnpm checks + - name: Run retrieval proofing (smoke profile) + run: pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing + - name: Comment success summary on PR if: ${{ success() && github.event_name == 'pull_request' }} continue-on-error: true diff --git a/README.md b/README.md index b643150..1da25f9 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ pnpm dev -- chat pnpm dev -- chat "summarize this repo" pnpm dev -- plan "create a rollout plan for indexing" pnpm dev -- index . +pnpm dev retrieval-proof --profile smoke pnpm dev -- automations list pnpm dev -- automations add --name "Hourly Check" --cron "0 * * * *" --prompt "summarize local status" pnpm dev -- automations run @@ -94,3 +95,4 @@ Environment variables (BYOK): - Anthropic embeddings currently fall back to deterministic local vectors. - This project intentionally uses Biome only (no ESLint/Prettier). 
+- Retrieval proofing benchmark schema/workflow docs: `docs/retrieval-proofing-benchmark-schema.md` and `docs/retrieval-proofing.md`. diff --git a/benchmarks/retrieval-proofing/benchmark.v1.json b/benchmarks/retrieval-proofing/benchmark.v1.json new file mode 100644 index 0000000..734c13b --- /dev/null +++ b/benchmarks/retrieval-proofing/benchmark.v1.json @@ -0,0 +1,217 @@ +{ + "version": "1.0", + "datasetName": "retrieval-proofing-core", + "datasetVersion": "2026.03.01", + "cases": [ + { + "id": "repo-layout", + "title": "Find retrieval implementation location", + "query": "where is hybrid retrieval implemented", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/context/retrieval/hybrid.ts", + "title": "Hybrid retrieval source", + "content": "The hybrid retrieval runner combines lexical search, vector similarity, and ranking metadata." + }, + { + "id": "d2", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank helpers", + "content": "hybridRerank computes weighted retrieval ordering and cosine similarity for vector retrieval." + }, + { + "id": "d3", + "path": "README.md", + "title": "Project overview", + "content": "General project overview and quick start commands for local development." + }, + { + "id": "d4", + "path": "src/cli/commands.ts", + "title": "CLI commands", + "content": "Registers chat, plan, index, and automations commands." + } + ] + }, + { + "id": "quality-commands", + "title": "Identify quality gates", + "query": "what command runs lint typecheck and tests", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1"], + "documents": [ + { + "id": "d1", + "path": "package.json", + "title": "Project scripts", + "content": "The checks script runs pnpm test, pnpm typecheck, pnpm lint, and pnpm build." 
+ }, + { + "id": "d2", + "path": "README.md", + "title": "README quality commands", + "content": "The README includes lint, typecheck, test, and build as quality commands." + }, + { + "id": "d3", + "path": "src/automation/runner.ts", + "title": "Automation runner", + "content": "Executes configured prompts on cron schedules." + }, + { + "id": "d4", + "path": "src/db/client.ts", + "title": "Database client", + "content": "Initializes PGLite and exposes query and exec methods." + } + ] + }, + { + "id": "provider-preflight", + "title": "Provider preflight requirements", + "query": "which env var is required for google provider preflight", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/cli/commands/chat.tsx", + "title": "Chat preflight", + "content": "Chat preflight checks provider env vars and prints setup instructions for google openai and anthropic." + }, + { + "id": "d2", + "path": "README.md", + "title": "Environment variable docs", + "content": "GOOGLE_GENERATIVE_AI_API_KEY is required when using the google provider." + }, + { + "id": "d3", + "path": "src/mcp/client.ts", + "title": "MCP client", + "content": "Starts and interacts with external MCP servers." + }, + { + "id": "d4", + "path": "src/context/indexer/full-index.ts", + "title": "Indexer", + "content": "Indexes repository files and writes chunks and embeddings." + } + ] + }, + { + "id": "policy-approval", + "title": "Approval behavior", + "query": "which actions require approval in interactive mode", + "intent": "reasoning", + "difficulty": "high", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "README.md", + "title": "Interactive approval docs", + "content": "Sensitive write and destructive tool actions require explicit approve, deny, or dismiss decisions in the TUI." 
+ }, + { + "id": "d2", + "path": "src/policy/engine.ts", + "title": "Policy engine", + "content": "Policy engine classifies tool side effects and enforces approval decisions." + }, + { + "id": "d3", + "path": "src/context/retrieval/rerank.ts", + "title": "Rerank", + "content": "Reranks retrieval candidates with weighted score combination." + }, + { + "id": "d4", + "path": "src/db/migrate.ts", + "title": "Migrations", + "content": "Applies schema migrations at startup." + } + ] + }, + { + "id": "automation-hooks", + "title": "Hook trigger behavior", + "query": "what hooks trigger tests or typecheck", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1"], + "documents": [ + { + "id": "d1", + "path": "AGENTS.md", + "title": "Hook definitions", + "content": "file-change runs pnpm test and git-head-change runs pnpm typecheck." + }, + { + "id": "d2", + "path": "README.md", + "title": "README automation", + "content": "Automations can run prompts but does not define hook command mapping." + }, + { + "id": "d3", + "path": "src/automation/scheduler.ts", + "title": "Scheduler", + "content": "Cron scheduler dispatches queued automation specs." + }, + { + "id": "d4", + "path": "src/agent/orchestrator.ts", + "title": "Orchestrator", + "content": "Runs gather reason act verify loops with validation retries." + } + ] + }, + { + "id": "ci-workflow", + "title": "CI checks pipeline", + "query": "where is pr checks workflow defined", + "intent": "lookup", + "difficulty": "medium", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": ".github/workflows/pr-checks.yml", + "title": "PR checks workflow", + "content": "Runs pnpm checks in CI on pull requests." + }, + { + "id": "d2", + "path": "README.md", + "title": "Project quality commands", + "content": "The quality command list maps to CI checks execution." 
+ }, + { + "id": "d3", + "path": "src/tools/registry.ts", + "title": "Tool registry", + "content": "Defines registered local tools and validation." + }, + { + "id": "d4", + "path": "src/db/client.ts", + "title": "Database client", + "content": "Provides thin wrapper around PGLite query execution." + } + ] + } + ] +} diff --git a/benchmarks/retrieval-proofing/profiles.v1.json b/benchmarks/retrieval-proofing/profiles.v1.json new file mode 100644 index 0000000..ff5c585 --- /dev/null +++ b/benchmarks/retrieval-proofing/profiles.v1.json @@ -0,0 +1,36 @@ +{ + "version": "1.0", + "profiles": { + "smoke": { + "description": "Fast CI profile with a representative subset of benchmark cases", + "caseIds": ["repo-layout", "quality-commands", "provider-preflight"], + "thresholds": { + "hybridMinimums": { + "evidenceRelevance": 0.55, + "citationSupportCoverage": 0.75, + "compositeScore": 0.62, + "maxUnsupportedClaimPenalty": 0.45 + }, + "baselineDeltaFloors": { + "lexical": -0.03, + "vector": 0.02 + } + } + }, + "full": { + "description": "Full benchmark profile for deeper retrieval proofing", + "thresholds": { + "hybridMinimums": { + "evidenceRelevance": 0.5, + "citationSupportCoverage": 0.7, + "compositeScore": 0.58, + "maxUnsupportedClaimPenalty": 0.5 + }, + "baselineDeltaFloors": { + "lexical": -0.01, + "vector": 0.02 + } + } + } + } +} diff --git a/docs/retrieval-proofing-benchmark-schema.md b/docs/retrieval-proofing-benchmark-schema.md new file mode 100644 index 0000000..3551b00 --- /dev/null +++ b/docs/retrieval-proofing-benchmark-schema.md @@ -0,0 +1,63 @@ +# Retrieval Proofing Benchmark Schema (v1.0) + +This document defines the versioned fixture format used by retrieval proofing. 
+ +## Fixture File + +Path: `benchmarks/retrieval-proofing/benchmark.v1.json` + +Top-level shape: + +```json +{ + "version": "1.0", + "datasetName": "retrieval-proofing-core", + "datasetVersion": "2026.03.01", + "cases": [ + { + "id": "repo-layout", + "title": "Find retrieval implementation location", + "query": "where is hybrid retrieval implemented", + "intent": "lookup", + "difficulty": "low", + "topK": 3, + "expectedEvidenceDocIds": ["d1", "d2"], + "documents": [ + { + "id": "d1", + "path": "src/context/retrieval/hybrid.ts", + "title": "Hybrid retrieval source", + "content": "..." + } + ] + } + ] +} +``` + +## Field Semantics + +- `version`: Fixture schema version. Must be `1.0` for this release. +- `datasetName`: Human-readable benchmark dataset name. +- `datasetVersion`: Version of benchmark content. Bump when case content or labels change. +- `cases`: Benchmark case list. +- `cases[].id`: Stable identifier used by profile filters and reports. +- `cases[].query`: Query string used by all retrieval strategies. +- `cases[].topK`: Number of retrieved documents considered for scoring. +- `cases[].documents`: Candidate evidence set for the case. +- `cases[].expectedEvidenceDocIds`: Canonical evidence documents used for deterministic scoring. + +## Profile File + +Path: `benchmarks/retrieval-proofing/profiles.v1.json` + +- `version`: Profile schema version (`1.0`). +- `profiles.<name>.caseIds`: Optional subset of case IDs for this profile. +- `profiles.<name>.thresholds.hybridMinimums`: Absolute floors for hybrid metrics. +- `profiles.<name>.thresholds.baselineDeltaFloors`: Minimum hybrid-vs-baseline composite deltas. + +## Versioning Rules + +- Bump `datasetVersion` whenever benchmark content changes. +- Keep schema `version` at `1.0` unless the JSON structure changes. +- Prefer adding new cases over mutating existing case IDs to preserve comparability.
diff --git a/docs/retrieval-proofing.md b/docs/retrieval-proofing.md new file mode 100644 index 0000000..88d6c5a --- /dev/null +++ b/docs/retrieval-proofing.md @@ -0,0 +1,54 @@ +# Retrieval Quality Proofing + +Retrieval proofing evaluates `lexical`, `vector`, and `hybrid` strategies on the same benchmark dataset, emits JSON/Markdown artifacts, and enforces hybrid quality gates. + +## Run Locally + +Smoke profile: + +```bash +pnpm dev retrieval-proof --profile smoke +``` + +Full profile: + +```bash +pnpm dev retrieval-proof --profile full +``` + +Custom output directory: + +```bash +pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing +``` + +## Artifacts + +Each run writes: + +- `<profile>-<timestamp>.json`: per-case metrics, aggregate metrics, hybrid deltas, gate result. +- `<profile>-<timestamp>.md`: concise human-readable summary for PR/release notes. + +Default output path: + +- `artifacts/retrieval-proofing/` + +## CI Workflow + +PR checks run retrieval proofing with the smoke profile: + +```bash +pnpm dev retrieval-proof --profile smoke --output-dir artifacts/retrieval-proofing +``` + +If hybrid thresholds fail, the command exits non-zero and CI fails. + +## Updating Baseline Thresholds Safely + +1. Run the full profile locally and inspect both JSON and Markdown reports. +2. Confirm changes are intentional and linked to retrieval behavior changes. +3. Update thresholds in `benchmarks/retrieval-proofing/profiles.v1.json`. +4. Re-run both `smoke` and `full` profiles and ensure results are stable. +5. Include rationale for threshold changes in PR description (what changed and why). + +Avoid lowering thresholds to mask regressions. Prefer improving retrieval behavior first.
diff --git a/openspec/changes/retrieval-quality-proofing/.openspec.yaml b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/.openspec.yaml similarity index 100% rename from openspec/changes/retrieval-quality-proofing/.openspec.yaml rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/.openspec.yaml diff --git a/openspec/changes/retrieval-quality-proofing/design.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/design.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/design.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/design.md diff --git a/openspec/changes/retrieval-quality-proofing/proposal.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/proposal.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/proposal.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/proposal.md diff --git a/openspec/changes/retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md similarity index 100% rename from openspec/changes/retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md rename to openspec/changes/archive/2026-03-04-retrieval-quality-proofing/specs/retrieval-quality-proofing/spec.md diff --git a/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md new file mode 100644 index 0000000..b914fd6 --- /dev/null +++ b/openspec/changes/archive/2026-03-04-retrieval-quality-proofing/tasks.md @@ -0,0 +1,23 @@ +## 1. 
Benchmark Dataset and Configuration + +- [x] 1.1 Define and document versioned benchmark fixture schema for retrieval proofing cases +- [x] 1.2 Add initial benchmark dataset covering multiple query intents and grounding difficulty levels +- [x] 1.3 Implement profile-based proofing configuration (for example `smoke` and `full`) with threshold settings + +## 2. Evaluation Runner and Scoring + +- [x] 2.1 Implement retrieval proofing runner that executes lexical, vector, and hybrid modes over the same case set +- [x] 2.2 Implement deterministic grounding metric scoring for evidence relevance, citation support, and unsupported-claim penalty +- [x] 2.3 Add aggregate scoring and strategy delta computation suitable for pass/fail gating + +## 3. Reporting and CLI Integration + +- [x] 3.1 Add CLI command(s) to run retrieval proofing for a selected benchmark profile +- [x] 3.2 Generate JSON report artifacts with per-case and aggregate metrics for each strategy +- [x] 3.3 Generate Markdown summary report highlighting hybrid-vs-baseline outcomes and gate status + +## 4. Quality Gates and Verification + +- [x] 4.1 Integrate proofing command into CI with non-zero exit on failed hybrid thresholds +- [x] 4.2 Add tests for scoring determinism, report schema stability, and gate pass/fail behavior +- [x] 4.3 Document local and CI proofing workflows, including how to update baseline thresholds safely diff --git a/openspec/changes/retrieval-quality-proofing/tasks.md b/openspec/changes/retrieval-quality-proofing/tasks.md deleted file mode 100644 index 0fe1789..0000000 --- a/openspec/changes/retrieval-quality-proofing/tasks.md +++ /dev/null @@ -1,23 +0,0 @@ -## 1. 
Benchmark Dataset and Configuration - -- [ ] 1.1 Define and document versioned benchmark fixture schema for retrieval proofing cases -- [ ] 1.2 Add initial benchmark dataset covering multiple query intents and grounding difficulty levels -- [ ] 1.3 Implement profile-based proofing configuration (for example `smoke` and `full`) with threshold settings - -## 2. Evaluation Runner and Scoring - -- [ ] 2.1 Implement retrieval proofing runner that executes lexical, vector, and hybrid modes over the same case set -- [ ] 2.2 Implement deterministic grounding metric scoring for evidence relevance, citation support, and unsupported-claim penalty -- [ ] 2.3 Add aggregate scoring and strategy delta computation suitable for pass/fail gating - -## 3. Reporting and CLI Integration - -- [ ] 3.1 Add CLI command(s) to run retrieval proofing for a selected benchmark profile -- [ ] 3.2 Generate JSON report artifacts with per-case and aggregate metrics for each strategy -- [ ] 3.3 Generate Markdown summary report highlighting hybrid-vs-baseline outcomes and gate status - -## 4. Quality Gates and Verification - -- [ ] 4.1 Integrate proofing command into CI with non-zero exit on failed hybrid thresholds -- [ ] 4.2 Add tests for scoring determinism, report schema stability, and gate pass/fail behavior -- [ ] 4.3 Document local and CI proofing workflows, including how to update baseline thresholds safely diff --git a/openspec/specs/retrieval-quality-proofing/spec.md b/openspec/specs/retrieval-quality-proofing/spec.md new file mode 100644 index 0000000..ef4b8ec --- /dev/null +++ b/openspec/specs/retrieval-quality-proofing/spec.md @@ -0,0 +1,37 @@ +# retrieval-quality-proofing Specification + +## Purpose +TBD - created by archiving change retrieval-quality-proofing. Update Purpose after archive. 
+## Requirements +### Requirement: Multi-Strategy Retrieval Evaluation +The system MUST execute the same benchmark question set against at least three retrieval strategies: lexical-only, vector-only, and hybrid. + +#### Scenario: Compare strategies on shared benchmark +- **WHEN** a proofing run starts for a benchmark profile +- **THEN** the system runs every benchmark case across lexical, vector, and hybrid modes using identical inputs and scoring configuration + +### Requirement: Deterministic Grounding Metrics +The system MUST calculate deterministic grounding metrics for each benchmark case and strategy, including evidence relevance, citation support coverage, and unsupported-claim penalty. + +#### Scenario: Produce deterministic scores +- **WHEN** the same benchmark profile and repository state are evaluated multiple times +- **THEN** the computed grounding metrics and aggregate scores are identical across runs except for explicitly declared non-deterministic fields + +### Requirement: Versioned Benchmark and Report Artifacts +The system MUST support versioned benchmark fixtures and emit both machine-readable and human-readable report artifacts for every proofing run. + +#### Scenario: Generate proof artifacts +- **WHEN** a proofing run completes +- **THEN** the system writes a JSON report containing per-case and aggregate metric values and writes a Markdown summary highlighting strategy deltas and pass/fail gate status + +### Requirement: Hybrid Quality Gate Enforcement +The system MUST enforce configurable quality gates that verify hybrid retrieval outperforms configured baseline strategies on grounding metrics. 
+ +#### Scenario: Gate fails on hybrid regression +- **WHEN** a proofing run determines that hybrid retrieval does not meet configured improvement thresholds versus baseline +- **THEN** the command exits non-zero and marks the run as failed for CI enforcement + +#### Scenario: Gate passes on acceptable hybrid improvement +- **WHEN** a proofing run determines that hybrid retrieval meets configured improvement thresholds versus baseline +- **THEN** the command exits zero and marks the run as passing + diff --git a/src/cli/commands.ts b/src/cli/commands.ts index 683716f..00564bc 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -7,6 +7,7 @@ import { import { runChatCommand } from './commands/chat'; import { runIndexCommand } from './commands/index'; import { runPlanCommand } from './commands/plan'; +import { runRetrievalProofCommand } from './commands/retrieval-proof'; export function createProgram(): Command { const program = new Command(); @@ -37,6 +38,38 @@ export function createProgram(): Command { await runIndexCommand(repoRoot); }); + program + .command('retrieval-proof') + .description('Run retrieval quality proofing against benchmark profiles') + .option( + '--benchmark ', + 'benchmark fixture JSON path', + 'benchmarks/retrieval-proofing/benchmark.v1.json' + ) + .option( + '--profiles ', + 'benchmark profiles JSON path', + 'benchmarks/retrieval-proofing/profiles.v1.json' + ) + .option('--profile ', 'benchmark profile name', 'smoke') + .option( + '--output-dir ', + 'directory for generated reports', + 'artifacts/retrieval-proofing' + ) + .option('--no-fail-on-gate', 'do not exit non-zero when gate fails') + .action( + async (options: { + benchmark: string; + profiles: string; + profile: string; + outputDir: string; + failOnGate: boolean; + }) => { + await runRetrievalProofCommand(options); + } + ); + const automations = program.command('automations').description('Manage local automations'); automations.command('list').action(async () => { diff --git 
a/src/cli/commands/retrieval-proof.ts b/src/cli/commands/retrieval-proof.ts new file mode 100644 index 0000000..905a2c1 --- /dev/null +++ b/src/cli/commands/retrieval-proof.ts @@ -0,0 +1,51 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import { isAbsolute, join, resolve } from 'node:path'; +import { formatProofingMarkdown } from '../../context/retrieval/proofing/reports'; +import { runRetrievalProofing } from '../../context/retrieval/proofing/runner'; + +type RetrievalProofCommandOptions = { + benchmark: string; + profiles: string; + profile: string; + outputDir: string; + failOnGate: boolean; +}; + +export async function runRetrievalProofCommand( + options: RetrievalProofCommandOptions +): Promise { + const benchmarkPath = absoluteFromCwd(options.benchmark); + const profilesPath = absoluteFromCwd(options.profiles); + const outputDir = absoluteFromCwd(options.outputDir); + + const report = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: options.profile, + }); + + await mkdir(outputDir, { recursive: true }); + const timestamp = report.generatedAt.replaceAll(':', '-'); + const baseName = `${report.profile}-${timestamp}`; + const jsonPath = join(outputDir, `${baseName}.json`); + const markdownPath = join(outputDir, `${baseName}.md`); + + await writeFile(jsonPath, `${JSON.stringify(report, null, 2)}\n`, 'utf8'); + await writeFile(markdownPath, `${formatProofingMarkdown(report)}\n`, 'utf8'); + + console.log(`Retrieval proofing complete for profile "${report.profile}".`); + console.log(`Gate status: ${report.gate.passed ? 
'PASS' : 'FAIL'}`); + console.log(`JSON report: ${jsonPath}`); + console.log(`Markdown report: ${markdownPath}`); + + if (!report.gate.passed && options.failOnGate) { + throw new Error(`Retrieval proofing gate failed: ${report.gate.failures.join('; ')}`); + } +} + +function absoluteFromCwd(path: string): string { + if (isAbsolute(path)) { + return path; + } + return resolve(process.cwd(), path); +} diff --git a/src/context/retrieval/proofing/reports.ts b/src/context/retrieval/proofing/reports.ts new file mode 100644 index 0000000..0067ec2 --- /dev/null +++ b/src/context/retrieval/proofing/reports.ts @@ -0,0 +1,65 @@ +import type { RetrievalProofingReport } from './schema'; + +export function formatProofingMarkdown(report: RetrievalProofingReport): string { + const hybridAggregate = report.strategies.hybrid.aggregate.metrics; + const lexicalAggregate = report.strategies.lexical.aggregate.metrics; + const vectorAggregate = report.strategies.vector.aggregate.metrics; + + const lines = [ + '# Retrieval Quality Proofing Report', + '', + `- Generated: ${report.generatedAt}`, + `- Benchmark: ${report.benchmark.datasetName}@${report.benchmark.datasetVersion}`, + `- Profile: ${report.profile}`, + `- Gate: ${report.gate.passed ? 
'PASS' : 'FAIL'}`, + '', + '## Aggregate Metrics', + '', + '| Strategy | Evidence Relevance | Citation Coverage | Unsupported Penalty | Composite |', + '| --- | ---: | ---: | ---: | ---: |', + renderAggregateRow('hybrid', hybridAggregate), + renderAggregateRow('lexical', lexicalAggregate), + renderAggregateRow('vector', vectorAggregate), + '', + '## Hybrid Deltas vs Baselines', + '', + '| Baseline | Evidence Relevance Δ | Citation Coverage Δ | Unsupported Penalty Δ | Composite Δ |', + '| --- | ---: | ---: | ---: | ---: |', + ...report.hybridDeltas.map((entry) => + [ + `| ${entry.baseline}`, + `${entry.metricDeltas.evidenceRelevance.toFixed(3)}`, + `${entry.metricDeltas.citationSupportCoverage.toFixed(3)}`, + `${entry.metricDeltas.unsupportedClaimPenalty.toFixed(3)}`, + `${entry.metricDeltas.compositeScore.toFixed(3)} |`, + ].join(' | ') + ), + '', + '## Gate Status', + '', + report.gate.passed ? '- All configured thresholds passed.' : '- Failure reasons:', + ...report.gate.failures.map((failure) => ` - ${failure}`), + '', + '## Per-Case Hybrid Summary', + '', + '| Case | Retrieved Doc IDs | Expected Evidence IDs | Composite |', + '| --- | --- | --- | ---: |', + ...report.strategies.hybrid.cases.map((entry) => + [ + `| ${entry.caseId}`, + entry.retrievedDocIds.join(', '), + entry.expectedEvidenceDocIds.join(', '), + `${entry.metrics.compositeScore.toFixed(3)} |`, + ].join(' | ') + ), + ]; + + return lines.join('\n'); +} + +function renderAggregateRow( + strategy: string, + metrics: RetrievalProofingReport['strategies']['hybrid']['aggregate']['metrics'] +): string { + return `| ${strategy} | ${metrics.evidenceRelevance.toFixed(3)} | ${metrics.citationSupportCoverage.toFixed(3)} | ${metrics.unsupportedClaimPenalty.toFixed(3)} | ${metrics.compositeScore.toFixed(3)} |`; +} diff --git a/src/context/retrieval/proofing/runner.ts b/src/context/retrieval/proofing/runner.ts new file mode 100644 index 0000000..00be525 --- /dev/null +++ 
b/src/context/retrieval/proofing/runner.ts @@ -0,0 +1,213 @@ +import { readFile } from 'node:fs/promises'; +import { cosineSimilarity, deterministicEmbedding } from '../rerank'; +import { + type BenchmarkCase, + type BenchmarkProfile, + BenchmarkProfilesSchema, + RetrievalBenchmarkSchema, + type RetrievalProofingReport, + RetrievalProofingReportSchema, + type RetrievalProofingStrategy, +} from './schema'; +import { averageCaseMetrics, scoreCaseMetrics, subtractMetrics } from './scoring'; + +const STRATEGIES: RetrievalProofingStrategy[] = ['lexical', 'vector', 'hybrid']; + +export async function runRetrievalProofing(input: { + benchmarkPath: string; + profilesPath: string; + profileName: string; +}): Promise { + const benchmark = await loadBenchmark(input.benchmarkPath); + const profiles = await loadProfiles(input.profilesPath); + const profile = profiles.profiles[input.profileName]; + + if (!profile) { + throw new Error( + `Unknown benchmark profile "${input.profileName}". Available: ${Object.keys(profiles.profiles).join(', ')}` + ); + } + + const selectedCases = selectCases(benchmark.cases, profile); + const strategyReports = Object.fromEntries( + STRATEGIES.map((strategy) => { + const cases = selectedCases.map((benchmarkCase) => { + const retrievedDocIds = retrieveDocsForCase(benchmarkCase, strategy); + const metrics = scoreCaseMetrics({ benchmarkCase, retrievedDocIds }); + return { + caseId: benchmarkCase.id, + strategy, + retrievedDocIds, + expectedEvidenceDocIds: benchmarkCase.expectedEvidenceDocIds, + metrics, + }; + }); + const aggregate = { + strategy, + metrics: averageCaseMetrics(cases.map((entry) => entry.metrics)), + }; + return [strategy, { cases, aggregate }]; + }) + ) as RetrievalProofingReport['strategies']; + + const hybridAggregate = strategyReports.hybrid.aggregate.metrics; + const lexicalAggregate = strategyReports.lexical.aggregate.metrics; + const vectorAggregate = strategyReports.vector.aggregate.metrics; + + const hybridDeltas = [ + { + 
baseline: 'lexical' as const, + metricDeltas: subtractMetrics(hybridAggregate, lexicalAggregate), + }, + { + baseline: 'vector' as const, + metricDeltas: subtractMetrics(hybridAggregate, vectorAggregate), + }, + ]; + + const gateFailures = evaluateGate({ + profile, + hybridAggregate, + lexicalAggregate, + vectorAggregate, + }); + + return RetrievalProofingReportSchema.parse({ + schemaVersion: '1.0', + benchmark: { + datasetName: benchmark.datasetName, + datasetVersion: benchmark.datasetVersion, + }, + profile: input.profileName, + generatedAt: new Date().toISOString(), + strategies: strategyReports, + hybridDeltas, + gate: { + passed: gateFailures.length === 0, + failures: gateFailures, + }, + }); +} + +function evaluateGate(input: { + profile: BenchmarkProfile; + hybridAggregate: RetrievalProofingReport['strategies']['hybrid']['aggregate']['metrics']; + lexicalAggregate: RetrievalProofingReport['strategies']['lexical']['aggregate']['metrics']; + vectorAggregate: RetrievalProofingReport['strategies']['vector']['aggregate']['metrics']; +}): string[] { + const failures: string[] = []; + const minimums = input.profile.thresholds.hybridMinimums; + const deltas = input.profile.thresholds.baselineDeltaFloors; + const hybrid = input.hybridAggregate; + + if (hybrid.evidenceRelevance < minimums.evidenceRelevance) { + failures.push( + `hybrid evidenceRelevance ${hybrid.evidenceRelevance.toFixed(3)} < ${minimums.evidenceRelevance.toFixed(3)}` + ); + } + if (hybrid.citationSupportCoverage < minimums.citationSupportCoverage) { + failures.push( + `hybrid citationSupportCoverage ${hybrid.citationSupportCoverage.toFixed(3)} < ${minimums.citationSupportCoverage.toFixed(3)}` + ); + } + if (hybrid.compositeScore < minimums.compositeScore) { + failures.push( + `hybrid compositeScore ${hybrid.compositeScore.toFixed(3)} < ${minimums.compositeScore.toFixed(3)}` + ); + } + if (hybrid.unsupportedClaimPenalty > minimums.maxUnsupportedClaimPenalty) { + failures.push( + `hybrid 
unsupportedClaimPenalty ${hybrid.unsupportedClaimPenalty.toFixed(3)} > ${minimums.maxUnsupportedClaimPenalty.toFixed(3)}` + ); + } + + const hybridVsLexical = hybrid.compositeScore - input.lexicalAggregate.compositeScore; + if (hybridVsLexical < deltas.lexical) { + failures.push( + `hybrid-vs-lexical composite delta ${hybridVsLexical.toFixed(3)} < ${deltas.lexical.toFixed(3)}` + ); + } + + const hybridVsVector = hybrid.compositeScore - input.vectorAggregate.compositeScore; + if (hybridVsVector < deltas.vector) { + failures.push( + `hybrid-vs-vector composite delta ${hybridVsVector.toFixed(3)} < ${deltas.vector.toFixed(3)}` + ); + } + + return failures; +} + +function selectCases(cases: BenchmarkCase[], profile: BenchmarkProfile): BenchmarkCase[] { + if (!profile.caseIds || profile.caseIds.length === 0) { + return cases; + } + + const wanted = new Set(profile.caseIds); + const selected = cases.filter((entry) => wanted.has(entry.id)); + if (selected.length !== profile.caseIds.length) { + const selectedIds = new Set(selected.map((entry) => entry.id)); + const missing = profile.caseIds.filter((id) => !selectedIds.has(id)); + throw new Error(`Profile references missing benchmark case IDs: ${missing.join(', ')}`); + } + return selected; +} + +function retrieveDocsForCase( + benchmarkCase: BenchmarkCase, + strategy: RetrievalProofingStrategy +): string[] { + const rows = benchmarkCase.documents.map((doc) => { + const lexicalScore = computeLexicalScore(benchmarkCase.query, `${doc.title} ${doc.content}`); + const vectorScore = cosineSimilarity( + deterministicEmbedding(benchmarkCase.query), + deterministicEmbedding(`${doc.title} ${doc.content}`) + ); + const totalScore = + strategy === 'lexical' + ? lexicalScore + : strategy === 'vector' + ? 
vectorScore + : lexicalScore * 0.7 + vectorScore * 0.3; + + return { + id: doc.id, + totalScore, + }; + }); + + return rows + .sort((a, b) => b.totalScore - a.totalScore) + .slice(0, benchmarkCase.topK) + .map((entry) => entry.id); +} + +function computeLexicalScore(query: string, haystack: string): number { + const tokens = query + .toLowerCase() + .split(/[^a-z0-9]+/g) + .filter(Boolean); + if (tokens.length === 0) { + return 0; + } + + const source = haystack.toLowerCase(); + let score = 0; + for (const token of tokens) { + if (source.includes(token)) { + score += 1; + } + } + + return score / tokens.length; +} + +async function loadBenchmark(path: string) { + const raw = await readFile(path, 'utf8'); + return RetrievalBenchmarkSchema.parse(JSON.parse(raw)); +} + +async function loadProfiles(path: string) { + const raw = await readFile(path, 'utf8'); + return BenchmarkProfilesSchema.parse(JSON.parse(raw)); +} diff --git a/src/context/retrieval/proofing/schema.ts b/src/context/retrieval/proofing/schema.ts new file mode 100644 index 0000000..0e7a78c --- /dev/null +++ b/src/context/retrieval/proofing/schema.ts @@ -0,0 +1,118 @@ +import { z } from 'zod'; + +export const RetrievalProofingStrategySchema = z.enum(['lexical', 'vector', 'hybrid']); +export type RetrievalProofingStrategy = z.infer; + +export const BenchmarkDocumentSchema = z.object({ + id: z.string().min(1), + path: z.string().min(1), + title: z.string().min(1), + content: z.string().min(1), +}); + +export const BenchmarkCaseSchema = z.object({ + id: z.string().min(1), + title: z.string().min(1), + query: z.string().min(1), + intent: z.string().min(1), + difficulty: z.enum(['low', 'medium', 'high']), + topK: z.number().int().positive().default(3), + documents: z.array(BenchmarkDocumentSchema).min(2), + expectedEvidenceDocIds: z.array(z.string().min(1)).min(1), +}); +export type BenchmarkCase = z.infer; + +export const RetrievalBenchmarkSchema = z.object({ + version: z.literal('1.0'), + datasetName: 
// Top-level benchmark fixture: a named, versioned dataset of retrieval cases.
export const RetrievalBenchmarkSchema = z.object({
  version: z.literal('1.0'),
  datasetName: z.string().min(1),
  datasetVersion: z.string().min(1),
  cases: z.array(BenchmarkCaseSchema).min(1),
});
export type RetrievalBenchmark = z.infer<typeof RetrievalBenchmarkSchema>;

// Gate thresholds a profile applies to the hybrid strategy's aggregates.
export const ProofingThresholdsSchema = z.object({
  // Floors for "higher is better" metrics, plus a ceiling for the penalty.
  hybridMinimums: z.object({
    evidenceRelevance: z.number().min(0).max(1),
    citationSupportCoverage: z.number().min(0).max(1),
    compositeScore: z.number().min(0).max(1),
    maxUnsupportedClaimPenalty: z.number().min(0).max(1),
  }),
  // Minimum composite-score advantage hybrid must hold over each baseline;
  // values may be negative to tolerate a bounded regression.
  baselineDeltaFloors: z.object({
    lexical: z.number(),
    vector: z.number(),
  }),
});
export type ProofingThresholds = z.infer<typeof ProofingThresholdsSchema>;

// A named benchmark run configuration; omitting caseIds runs all cases.
export const BenchmarkProfileSchema = z.object({
  description: z.string().min(1),
  caseIds: z.array(z.string().min(1)).optional(),
  thresholds: ProofingThresholdsSchema,
});
export type BenchmarkProfile = z.infer<typeof BenchmarkProfileSchema>;

// The profiles fixture file: profile name -> profile definition.
export const BenchmarkProfilesSchema = z.object({
  version: z.literal('1.0'),
  profiles: z.record(z.string(), BenchmarkProfileSchema),
});
export type BenchmarkProfiles = z.infer<typeof BenchmarkProfilesSchema>;

// Per-case metric bundle; all values are normalized to [0, 1].
export const CaseMetricsSchema = z.object({
  evidenceRelevance: z.number().min(0).max(1),
  citationSupportCoverage: z.number().min(0).max(1),
  unsupportedClaimPenalty: z.number().min(0).max(1),
  compositeScore: z.number().min(0).max(1),
});
export type CaseMetrics = z.infer<typeof CaseMetricsSchema>;

// One case scored under one strategy, with the doc IDs behind the metrics.
export const StrategyCaseResultSchema = z.object({
  caseId: z.string(),
  strategy: RetrievalProofingStrategySchema,
  retrievedDocIds: z.array(z.string()),
  expectedEvidenceDocIds: z.array(z.string()),
  metrics: CaseMetricsSchema,
});
export type StrategyCaseResult = z.infer<typeof StrategyCaseResultSchema>;

// Mean of CaseMetrics across all cases run under one strategy.
export const StrategyAggregateSchema = z.object({
  strategy: RetrievalProofingStrategySchema,
  metrics: CaseMetricsSchema,
});
export type StrategyAggregate = z.infer<typeof StrategyAggregateSchema>;

// Hybrid-minus-baseline metric differences; deltas may be negative.
export const StrategyDeltaSchema = z.object({
  baseline: z.enum(['lexical', 'vector']),
  metricDeltas: z.object({
    evidenceRelevance: z.number(),
    citationSupportCoverage: z.number(),
    unsupportedClaimPenalty: z.number(),
    compositeScore: z.number(),
  }),
});
export type StrategyDelta = z.infer<typeof StrategyDeltaSchema>;

// Gate verdict: passed iff failures is empty.
export const GateResultSchema = z.object({
  passed: z.boolean(),
  failures: z.array(z.string()),
});
export type GateResult = z.infer<typeof GateResultSchema>;

// Full proofing report, persisted as a JSON artifact and consumed by CI.
export const RetrievalProofingReportSchema = z.object({
  schemaVersion: z.literal('1.0'),
  benchmark: z.object({
    datasetName: z.string(),
    datasetVersion: z.string(),
  }),
  profile: z.string(),
  // ISO-8601 timestamp of report generation (the only non-deterministic field).
  generatedAt: z.string(),
  // NOTE(review): z.record with an enum key validates only the keys that are
  // present (keys are effectively optional in zod v3), so a report missing a
  // strategy would still parse; the runner always supplies all three.
  // Confirm whether strict exhaustiveness is wanted here.
  strategies: z.record(
    RetrievalProofingStrategySchema,
    z.object({
      cases: z.array(StrategyCaseResultSchema),
      aggregate: StrategyAggregateSchema,
    })
  ),
  hybridDeltas: z.array(StrategyDeltaSchema),
  gate: GateResultSchema,
});
export type RetrievalProofingReport = z.infer<typeof RetrievalProofingReportSchema>;
return { + evidenceRelevance, + citationSupportCoverage, + unsupportedClaimPenalty, + compositeScore, + }; +} + +export function averageCaseMetrics(metrics: CaseMetrics[]): CaseMetrics { + if (metrics.length === 0) { + return { + evidenceRelevance: 0, + citationSupportCoverage: 0, + unsupportedClaimPenalty: 1, + compositeScore: 0, + }; + } + + return { + evidenceRelevance: average(metrics.map((metric) => metric.evidenceRelevance)), + citationSupportCoverage: average(metrics.map((metric) => metric.citationSupportCoverage)), + unsupportedClaimPenalty: average(metrics.map((metric) => metric.unsupportedClaimPenalty)), + compositeScore: average(metrics.map((metric) => metric.compositeScore)), + }; +} + +export function subtractMetrics(a: CaseMetrics, b: CaseMetrics): CaseMetrics { + return { + evidenceRelevance: a.evidenceRelevance - b.evidenceRelevance, + citationSupportCoverage: a.citationSupportCoverage - b.citationSupportCoverage, + unsupportedClaimPenalty: a.unsupportedClaimPenalty - b.unsupportedClaimPenalty, + compositeScore: a.compositeScore - b.compositeScore, + }; +} + +function average(values: number[]): number { + if (values.length === 0) { + return 0; + } + return values.reduce((acc, value) => acc + value, 0) / values.length; +} + +function divide(num: number, den: number): number { + if (den <= 0) { + return 0; + } + return num / den; +} + +function clamp01(value: number): number { + if (value <= 0) { + return 0; + } + if (value >= 1) { + return 1; + } + return value; +} diff --git a/tests/retrieval-proofing.test.ts b/tests/retrieval-proofing.test.ts new file mode 100644 index 0000000..66333fb --- /dev/null +++ b/tests/retrieval-proofing.test.ts @@ -0,0 +1,90 @@ +import { mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join, resolve } from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { runRetrievalProofing } from '../src/context/retrieval/proofing/runner'; +import { 
RetrievalProofingReportSchema } from '../src/context/retrieval/proofing/schema'; + +const benchmarkPath = resolve(process.cwd(), 'benchmarks/retrieval-proofing/benchmark.v1.json'); +const profilesPath = resolve(process.cwd(), 'benchmarks/retrieval-proofing/profiles.v1.json'); + +describe('retrieval proofing', () => { + it('produces deterministic scoring for fixed benchmark/profile inputs', async () => { + const first = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + const second = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + + expect(first.generatedAt).not.toEqual(second.generatedAt); + expect({ ...first, generatedAt: 'fixed' }).toEqual({ ...second, generatedAt: 'fixed' }); + }); + + it('keeps JSON report schema stable and parseable', async () => { + const report = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + + const parsed = RetrievalProofingReportSchema.parse(report); + + expect(parsed.schemaVersion).toBe('1.0'); + expect(Object.keys(parsed.strategies)).toEqual(['lexical', 'vector', 'hybrid']); + expect(parsed.strategies.hybrid.cases.length).toBeGreaterThan(0); + expect(typeof parsed.gate.passed).toBe('boolean'); + }); + + it('reports gate pass/fail based on configured thresholds', async () => { + const passReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath, + profileName: 'smoke', + }); + expect(passReport.gate.passed).toBe(true); + + const tempDir = await mkdtemp(join(tmpdir(), 'retrieval-proofing-')); + const strictProfilesPath = join(tempDir, 'profiles.strict.json'); + const baseProfiles = JSON.parse(await readFile(profilesPath, 'utf8')) as { + version: string; + profiles: Record; + }; + + const strictProfiles = { + ...baseProfiles, + profiles: { + ...baseProfiles.profiles, + smoke: { + description: 'strict gate for failure test', + thresholds: { + hybridMinimums: { + evidenceRelevance: 0.99, + 
citationSupportCoverage: 0.99, + compositeScore: 0.99, + maxUnsupportedClaimPenalty: 0.01, + }, + baselineDeltaFloors: { + lexical: 0.2, + vector: 0.2, + }, + }, + }, + }, + }; + await writeFile(strictProfilesPath, `${JSON.stringify(strictProfiles, null, 2)}\n`, 'utf8'); + + const failReport = await runRetrievalProofing({ + benchmarkPath, + profilesPath: strictProfilesPath, + profileName: 'smoke', + }); + + expect(failReport.gate.passed).toBe(false); + expect(failReport.gate.failures.length).toBeGreaterThan(0); + }); +}); From c3c28895ba80c6fb1194462848e6998111a0bee2 Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 19:06:24 -0800 Subject: [PATCH 2/3] feat(embedding): add strategy-based fallback engine with provenance - add typed embedding strategy config, validation, and deterministic resolution - add anthropic native-first fallback policy with failure-category control - add embedding provenance envelope and persist provenance on chunk embeddings - surface provenance in retrieval metadata and optional provenance logging - add migration runner support for sequential SQL migrations and provenance column - add conformance/policy/provenance tests and rollout documentation - archive embedding-parity-hardening change and sync new capability specs --- README.md | 4 + docs/embedding-strategy-rollout.md | 31 +++ .../.openspec.yaml | 0 .../design.md | 0 .../proposal.md | 0 .../spec.md | 0 .../embedding-strategy-configuration/spec.md | 0 .../tasks.md | 26 +- .../spec.md | 38 +++ .../embedding-strategy-configuration/spec.md | 27 ++ src/cli/runtime.ts | 3 + src/context/embedding/config.ts | 77 ++++++ src/context/embedding/engine.ts | 173 ++++++++++++ src/context/embedding/strategy.ts | 185 +++++++++++++ src/context/indexer/full-index.ts | 104 +++++-- src/context/retrieval/hybrid.ts | 8 +- src/daemon/main.ts | 2 + src/db/migrate.ts | 30 +- .../migrations/0002_embedding_provenance.sql | 3 + tests/embedding-strategy.test.ts | 259 ++++++++++++++++++ 20 files 
changed, 929 insertions(+), 41 deletions(-) create mode 100644 docs/embedding-strategy-rollout.md rename openspec/changes/{embedding-parity-hardening => archive/2026-03-04-embedding-parity-hardening}/.openspec.yaml (100%) rename openspec/changes/{embedding-parity-hardening => archive/2026-03-04-embedding-parity-hardening}/design.md (100%) rename openspec/changes/{embedding-parity-hardening => archive/2026-03-04-embedding-parity-hardening}/proposal.md (100%) rename openspec/changes/{embedding-parity-hardening => archive/2026-03-04-embedding-parity-hardening}/specs/anthropic-embedding-fallback-and-provenance/spec.md (100%) rename openspec/changes/{embedding-parity-hardening => archive/2026-03-04-embedding-parity-hardening}/specs/embedding-strategy-configuration/spec.md (100%) rename openspec/changes/{embedding-parity-hardening => archive/2026-03-04-embedding-parity-hardening}/tasks.md (50%) create mode 100644 openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md create mode 100644 openspec/specs/embedding-strategy-configuration/spec.md create mode 100644 src/context/embedding/config.ts create mode 100644 src/context/embedding/engine.ts create mode 100644 src/context/embedding/strategy.ts create mode 100644 src/db/migrations/0002_embedding_provenance.sql create mode 100644 tests/embedding-strategy.test.ts diff --git a/README.md b/README.md index 1da25f9..e0c0a62 100644 --- a/README.md +++ b/README.md @@ -90,9 +90,13 @@ Environment variables (BYOK): - `DUBSBOT_ANTHROPIC_MODEL` - `DUBSBOT_GOOGLE_MODEL` (defaults to `gemini-3.1-pro-preview`) - `DUBSBOT_OTEL_ENABLED=1` to enable telemetry export hooks +- `DUBSBOT_EMBEDDING_STRATEGY_V2=1` to enable explicit embedding strategy resolution/fallback +- `DUBSBOT_EMBEDDING_STRATEGY_CONFIG_JSON` to provide explicit strategy config +- `DUBSBOT_EMBEDDING_PROVENANCE_LOG=1` to emit embedding provenance log lines ## Notes - Anthropic embeddings currently fall back to deterministic local vectors. 
- This project intentionally uses Biome only (no ESLint/Prettier). - Retrieval proofing benchmark schema/workflow docs: `docs/retrieval-proofing-benchmark-schema.md` and `docs/retrieval-proofing.md`. +- Embedding strategy rollout guide: `docs/embedding-strategy-rollout.md`. diff --git a/docs/embedding-strategy-rollout.md b/docs/embedding-strategy-rollout.md new file mode 100644 index 0000000..e8f983a --- /dev/null +++ b/docs/embedding-strategy-rollout.md @@ -0,0 +1,31 @@ +# Embedding Strategy V2 Rollout + +This rollout gates the explicit embedding strategy engine behind: + +- `DUBSBOT_EMBEDDING_STRATEGY_V2=1` + +Optional config override: + +- `DUBSBOT_EMBEDDING_STRATEGY_CONFIG_JSON` (JSON string matching schema version `1.0`) + +Optional provenance logging: + +- `DUBSBOT_EMBEDDING_PROVENANCE_LOG=1` + +## Enable (staged) + +1. Set `DUBSBOT_EMBEDDING_STRATEGY_V2=1` in a non-production environment. +2. Start with default legacy-mapped config (no custom JSON). +3. Run indexing and retrieval checks. +4. If needed, provide explicit strategy JSON to control Anthropic fallback paths. +5. Verify fallback/provenance behavior with tests: + - `pnpm test -- embedding-strategy` + +## Rollback + +1. Unset or set `DUBSBOT_EMBEDDING_STRATEGY_V2=0`. +2. Restart CLI/daemon processes. +3. System returns to legacy embedding execution path. + +Rollback is safe because provenance fields are additive and read-compatible. 
+ diff --git a/openspec/changes/embedding-parity-hardening/.openspec.yaml b/openspec/changes/archive/2026-03-04-embedding-parity-hardening/.openspec.yaml similarity index 100% rename from openspec/changes/embedding-parity-hardening/.openspec.yaml rename to openspec/changes/archive/2026-03-04-embedding-parity-hardening/.openspec.yaml diff --git a/openspec/changes/embedding-parity-hardening/design.md b/openspec/changes/archive/2026-03-04-embedding-parity-hardening/design.md similarity index 100% rename from openspec/changes/embedding-parity-hardening/design.md rename to openspec/changes/archive/2026-03-04-embedding-parity-hardening/design.md diff --git a/openspec/changes/embedding-parity-hardening/proposal.md b/openspec/changes/archive/2026-03-04-embedding-parity-hardening/proposal.md similarity index 100% rename from openspec/changes/embedding-parity-hardening/proposal.md rename to openspec/changes/archive/2026-03-04-embedding-parity-hardening/proposal.md diff --git a/openspec/changes/embedding-parity-hardening/specs/anthropic-embedding-fallback-and-provenance/spec.md b/openspec/changes/archive/2026-03-04-embedding-parity-hardening/specs/anthropic-embedding-fallback-and-provenance/spec.md similarity index 100% rename from openspec/changes/embedding-parity-hardening/specs/anthropic-embedding-fallback-and-provenance/spec.md rename to openspec/changes/archive/2026-03-04-embedding-parity-hardening/specs/anthropic-embedding-fallback-and-provenance/spec.md diff --git a/openspec/changes/embedding-parity-hardening/specs/embedding-strategy-configuration/spec.md b/openspec/changes/archive/2026-03-04-embedding-parity-hardening/specs/embedding-strategy-configuration/spec.md similarity index 100% rename from openspec/changes/embedding-parity-hardening/specs/embedding-strategy-configuration/spec.md rename to openspec/changes/archive/2026-03-04-embedding-parity-hardening/specs/embedding-strategy-configuration/spec.md diff --git 
a/openspec/changes/embedding-parity-hardening/tasks.md b/openspec/changes/archive/2026-03-04-embedding-parity-hardening/tasks.md similarity index 50% rename from openspec/changes/embedding-parity-hardening/tasks.md rename to openspec/changes/archive/2026-03-04-embedding-parity-hardening/tasks.md index 533c582..a3b7e6f 100644 --- a/openspec/changes/embedding-parity-hardening/tasks.md +++ b/openspec/changes/archive/2026-03-04-embedding-parity-hardening/tasks.md @@ -1,24 +1,24 @@ ## 1. Strategy Configuration Foundation -- [ ] 1.1 Introduce typed `embeddingStrategy` config schema with provider/model primary and ordered fallback entries -- [ ] 1.2 Add startup validation for unknown providers, missing models, and cyclic fallback paths with structured errors -- [ ] 1.3 Add backward-compatible default mapping from legacy embedding settings to explicit strategy definitions +- [x] 1.1 Introduce typed `embeddingStrategy` config schema with provider/model primary and ordered fallback entries +- [x] 1.2 Add startup validation for unknown providers, missing models, and cyclic fallback paths with structured errors +- [x] 1.3 Add backward-compatible default mapping from legacy embedding settings to explicit strategy definitions ## 2. 
Runtime Strategy Resolution and Anthropic Policy -- [ ] 2.1 Implement deterministic strategy resolver that requires a valid strategy id for each embedding request -- [ ] 2.2 Implement Anthropic native-first execution path with explicit fallback eligibility based on configured failure categories -- [ ] 2.3 Enforce configured fallback order and terminal failure behavior when fallback is disallowed or exhausted +- [x] 2.1 Implement deterministic strategy resolver that requires a valid strategy id for each embedding request +- [x] 2.2 Implement Anthropic native-first execution path with explicit fallback eligibility based on configured failure categories +- [x] 2.3 Enforce configured fallback order and terminal failure behavior when fallback is disallowed or exhausted ## 3. Provenance Envelope and Data Plumbing -- [ ] 3.1 Define a normalized embedding result envelope including strategy id, provider/model attempt path, fallback state, and failure category -- [ ] 3.2 Propagate provenance fields through indexing writes and retrieval/query responses -- [ ] 3.3 Update logging/metrics hooks to include provenance identifiers for debugging and parity analysis +- [x] 3.1 Define a normalized embedding result envelope including strategy id, provider/model attempt path, fallback state, and failure category +- [x] 3.2 Propagate provenance fields through indexing writes and retrieval/query responses +- [x] 3.3 Update logging/metrics hooks to include provenance identifiers for debugging and parity analysis ## 4. 
Verification and Rollout Safety -- [ ] 4.1 Add conformance tests for valid/invalid strategy config loading and runtime resolution behavior -- [ ] 4.2 Add Anthropic policy tests covering success, non-fallbackable failures, fallbackable failures, and no-fallback scenarios -- [ ] 4.3 Add provenance completeness tests for both successful and terminal-failure embedding outcomes -- [ ] 4.4 Gate rollout behind a feature flag and document enable/rollback procedure for staged deployment +- [x] 4.1 Add conformance tests for valid/invalid strategy config loading and runtime resolution behavior +- [x] 4.2 Add Anthropic policy tests covering success, non-fallbackable failures, fallbackable failures, and no-fallback scenarios +- [x] 4.3 Add provenance completeness tests for both successful and terminal-failure embedding outcomes +- [x] 4.4 Gate rollout behind a feature flag and document enable/rollback procedure for staged deployment diff --git a/openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md b/openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md new file mode 100644 index 0000000..e34f9d8 --- /dev/null +++ b/openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md @@ -0,0 +1,38 @@ +# anthropic-embedding-fallback-and-provenance Specification + +## Purpose +TBD - created by archiving change embedding-parity-hardening. Update Purpose after archive. +## Requirements +### Requirement: Anthropic Native-First Execution Policy +For strategies configured with Anthropic as primary, the system SHALL attempt Anthropic native embedding first and SHALL only consider fallback providers explicitly listed in that strategy. 
+ +#### Scenario: Anthropic primary succeeds +- **WHEN** a request resolves to a strategy with Anthropic as primary and Anthropic returns embeddings successfully +- **THEN** the system returns the Anthropic embedding result without invoking fallback providers + +#### Scenario: Anthropic primary fails with non-fallbackable error +- **WHEN** Anthropic returns an error outside configured fallbackable categories +- **THEN** the system returns a terminal embedding error and MUST NOT invoke fallback providers + +### Requirement: Controlled Anthropic Fallback Behavior +The system SHALL invoke fallback providers for Anthropic strategies only for configured fallbackable failure categories and in configured fallback order. + +#### Scenario: Fallback is invoked in configured order +- **WHEN** Anthropic primary fails with a fallbackable error category and fallback providers are configured +- **THEN** the system attempts fallback providers sequentially in strategy order until one succeeds or all fail + +#### Scenario: No fallback configured +- **WHEN** Anthropic primary fails with a fallbackable error category but no fallback providers are configured +- **THEN** the system returns a structured failure indicating no fallback path was available + +### Requirement: Embedding Provenance Metadata +The system SHALL attach provenance metadata to every embedding result and terminal failure outcome, including strategy id, attempt provider/model path, and fallback usage state. 
+ +#### Scenario: Provenance is emitted on success +- **WHEN** any provider successfully returns embeddings +- **THEN** the result includes provenance fields for strategy id, resolved provider/model, attempt path, and whether fallback was used + +#### Scenario: Provenance is emitted on terminal failure +- **WHEN** all attempts fail or fallback is disallowed +- **THEN** the error payload includes provenance fields for attempted providers/models, failure category, and terminal resolution reason + diff --git a/openspec/specs/embedding-strategy-configuration/spec.md b/openspec/specs/embedding-strategy-configuration/spec.md new file mode 100644 index 0000000..3d34504 --- /dev/null +++ b/openspec/specs/embedding-strategy-configuration/spec.md @@ -0,0 +1,27 @@ +# embedding-strategy-configuration Specification + +## Purpose +TBD - created by archiving change embedding-parity-hardening. Update Purpose after archive. +## Requirements +### Requirement: Provider-Configurable Embedding Strategy +The system SHALL support explicit embedding strategy configuration per embedding use-case, including primary provider/model selection and an ordered fallback list. + +#### Scenario: Valid strategy is loaded +- **WHEN** the service starts with a strategy configuration where each strategy has a primary provider/model and valid fallback entries +- **THEN** the system initializes successfully and registers the strategy for runtime resolution + +#### Scenario: Invalid strategy is rejected +- **WHEN** the service starts with a strategy configuration that references an unknown provider, missing model, or cyclic fallback path +- **THEN** the system MUST fail validation and return a configuration error that identifies the invalid strategy entry + +### Requirement: Deterministic Runtime Strategy Resolution +The system SHALL resolve embedding strategies deterministically for each request using the configured strategy identifier and SHALL NOT use implicit provider defaults. 
+ +#### Scenario: Strategy id resolves to configured primary +- **WHEN** an embedding request specifies a known strategy id +- **THEN** the system uses the configured primary provider/model for the first execution attempt + +#### Scenario: Unknown strategy id is rejected +- **WHEN** an embedding request specifies a strategy id not present in configuration +- **THEN** the system returns a structured error and MUST NOT attempt embedding generation + diff --git a/src/cli/runtime.ts b/src/cli/runtime.ts index d1528a6..7b45964 100644 --- a/src/cli/runtime.ts +++ b/src/cli/runtime.ts @@ -1,5 +1,6 @@ import { AgentOrchestrator } from '../agent/orchestrator'; import { loadAgentsConfig } from '../config/agents-loader'; +import { loadEmbeddingStrategyConfig } from '../context/embedding/config'; import { createDb } from '../db/client'; import { runMigrations } from '../db/migrate'; import { OptionalOtelExporter } from '../observability/otel'; @@ -13,6 +14,7 @@ import { ToolRegistry } from '../tools/registry'; export async function createRuntime() { await runMigrations(); const db = await createDb(); + const embeddingStrategyConfig = loadEmbeddingStrategyConfig(); const agentsConfig = await loadAgentsConfig(process.cwd()); const provider = createProviderAdapter(detectProvider()); const policyEngine = new DefaultPolicyEngine(createDefaultApprovalPolicy()); @@ -24,6 +26,7 @@ export async function createRuntime() { return { db, + embeddingStrategyConfig, provider, policyEngine, orchestrator, diff --git a/src/context/embedding/config.ts b/src/context/embedding/config.ts new file mode 100644 index 0000000..83b713e --- /dev/null +++ b/src/context/embedding/config.ts @@ -0,0 +1,77 @@ +import { detectProvider, type ProviderName } from '../../providers'; +import { + type EmbeddingStrategyConfig, + EmbeddingStrategyConfigError, + parseEmbeddingStrategyConfig, +} from './strategy'; + +export function loadEmbeddingStrategyConfig(): EmbeddingStrategyConfig { + const rawFromEnv = 
process.env.DUBSBOT_EMBEDDING_STRATEGY_CONFIG_JSON; + const raw = rawFromEnv ? JSON.parse(rawFromEnv) : buildLegacyDefaultConfig(); + const parsed = parseEmbeddingStrategyConfig(raw); + if (!parsed.config) { + throw new EmbeddingStrategyConfigError(parsed.issues); + } + return parsed.config; +} + +export function isEmbeddingStrategyV2Enabled(): boolean { + return process.env.DUBSBOT_EMBEDDING_STRATEGY_V2 === '1'; +} + +function buildLegacyDefaultConfig(): EmbeddingStrategyConfig { + const primaryProvider = detectProvider(); + const primary = toPrimaryStrategy(primaryProvider, 'default-primary'); + const strategies = [primary]; + + if (primaryProvider === 'anthropic') { + strategies.push(toPrimaryStrategy('openai', 'fallback-openai')); + strategies.push(toPrimaryStrategy('google', 'fallback-google')); + primary.fallback = [ + { + strategyId: 'fallback-openai', + onFailure: ['rate_limit', 'timeout', 'service_unavailable'], + }, + { + strategyId: 'fallback-google', + onFailure: ['rate_limit', 'timeout', 'service_unavailable'], + }, + ]; + } + + return { + version: '1.0', + defaults: { + indexing: 'default-primary', + query: 'default-primary', + }, + strategies, + }; +} + +function toPrimaryStrategy(provider: ProviderName, id: string) { + return { + id, + provider, + model: defaultEmbeddingModel(provider), + fallback: [] as Array<{ + strategyId: string; + onFailure: Array< + 'rate_limit' | 'timeout' | 'service_unavailable' | 'auth' | 'invalid_request' | 'unknown' + >; + }>, + }; +} + +function defaultEmbeddingModel(provider: ProviderName): string { + switch (provider) { + case 'openai': + return process.env.DUBSBOT_OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-small'; + case 'google': + return process.env.DUBSBOT_GOOGLE_EMBEDDING_MODEL ?? 'text-embedding-004'; + case 'anthropic': + return process.env.DUBSBOT_ANTHROPIC_EMBEDDING_MODEL ?? 
'deterministic-v1'; + default: + return 'deterministic-v1'; + } +} diff --git a/src/context/embedding/engine.ts b/src/context/embedding/engine.ts new file mode 100644 index 0000000..9159281 --- /dev/null +++ b/src/context/embedding/engine.ts @@ -0,0 +1,173 @@ +import type { ProviderAdapter } from '../../providers/types'; +import { + type EmbeddingStrategyConfig, + type FailureCategory, + resolveEmbeddingStrategy, +} from './strategy'; + +export type EmbeddingAttempt = { + strategyId: string; + provider: string; + model: string; + status: 'success' | 'failure'; + failureCategory?: FailureCategory; +}; + +export type EmbeddingProvenance = { + strategyId: string; + attemptPath: EmbeddingAttempt[]; + resolvedBy?: { strategyId: string; provider: string; model: string }; + fallbackUsed: boolean; + failureCategory?: FailureCategory; + terminalReason?: 'fallback_disallowed' | 'fallback_exhausted' | 'no_fallback'; +}; + +export type EmbeddingExecutionSuccess = { + ok: true; + embedding: number[]; + provider: string; + model: string; + provenance: EmbeddingProvenance; +}; + +export type EmbeddingExecutionFailure = { + ok: false; + message: string; + provenance: EmbeddingProvenance; +}; + +export type EmbeddingExecutionResult = EmbeddingExecutionSuccess | EmbeddingExecutionFailure; + +export class EmbeddingExecutionError extends Error { + constructor( + message: string, + public readonly provenance: EmbeddingProvenance + ) { + super(message); + this.name = 'EmbeddingExecutionError'; + } +} + +export async function executeEmbeddingWithStrategy(input: { + config: EmbeddingStrategyConfig; + strategyId: string; + value: string; + adapterForProvider: (provider: string) => ProviderAdapter; +}): Promise { + const attemptPath: EmbeddingAttempt[] = []; + const queue: string[] = [input.strategyId]; + const visited = new Set(); + let fallbackUsed = false; + + while (queue.length > 0) { + const currentId = queue.shift(); + if (!currentId || visited.has(currentId)) { + continue; + } + 
visited.add(currentId); + + const strategy = resolveEmbeddingStrategy(input.config, currentId); + const adapter = input.adapterForProvider(strategy.provider); + + try { + const vectors = await adapter.embed({ + model: strategy.model, + values: [input.value], + }); + const vector = vectors[0] ?? []; + attemptPath.push({ + strategyId: currentId, + provider: strategy.provider, + model: strategy.model, + status: 'success', + }); + return { + ok: true, + embedding: vector, + provider: strategy.provider, + model: strategy.model, + provenance: { + strategyId: input.strategyId, + attemptPath, + fallbackUsed, + resolvedBy: { + strategyId: currentId, + provider: strategy.provider, + model: strategy.model, + }, + }, + }; + } catch (error) { + const failureCategory = classifyEmbeddingFailure(error); + attemptPath.push({ + strategyId: currentId, + provider: strategy.provider, + model: strategy.model, + status: 'failure', + failureCategory, + }); + + const eligibleFallback = strategy.fallback.find((entry) => + entry.onFailure.includes(failureCategory) + ); + if (!eligibleFallback) { + return { + ok: false, + message: `Embedding failed for strategy "${currentId}" with category "${failureCategory}" and no eligible fallback.`, + provenance: { + strategyId: input.strategyId, + attemptPath, + fallbackUsed, + failureCategory, + terminalReason: strategy.fallback.length > 0 ? 'fallback_disallowed' : 'no_fallback', + }, + }; + } + + fallbackUsed = true; + queue.push(eligibleFallback.strategyId); + } + } + + return { + ok: false, + message: `Embedding failed for strategy "${input.strategyId}" after exhausting fallback chain.`, + provenance: { + strategyId: input.strategyId, + attemptPath, + fallbackUsed, + failureCategory: attemptPath.at(-1)?.failureCategory, + terminalReason: 'fallback_exhausted', + }, + }; +} + +export function classifyEmbeddingFailure(error: unknown): FailureCategory { + const message = + error instanceof Error ? 
error.message.toLowerCase() : String(error).toLowerCase(); + if (message.includes('rate limit') || message.includes('429')) { + return 'rate_limit'; + } + if (message.includes('timeout') || message.includes('timed out')) { + return 'timeout'; + } + if (message.includes('503') || message.includes('unavailable')) { + return 'service_unavailable'; + } + if (message.includes('401') || message.includes('403') || message.includes('auth')) { + return 'auth'; + } + if (message.includes('400') || message.includes('invalid')) { + return 'invalid_request'; + } + return 'unknown'; +} + +export function assertEmbeddingSuccess( + result: EmbeddingExecutionResult +): EmbeddingExecutionSuccess { + if (result.ok) { + return result; + } + throw new EmbeddingExecutionError(result.message, result.provenance); +} diff --git a/src/context/embedding/strategy.ts b/src/context/embedding/strategy.ts new file mode 100644 index 0000000..f3eac27 --- /dev/null +++ b/src/context/embedding/strategy.ts @@ -0,0 +1,185 @@ +import { z } from 'zod'; +import type { ProviderName } from '../../providers'; + +export const FailureCategorySchema = z.enum([ + 'rate_limit', + 'timeout', + 'service_unavailable', + 'auth', + 'invalid_request', + 'unknown', +]); +export type FailureCategory = z.infer; + +export const EmbeddingStrategyRefSchema = z.object({ + strategyId: z.string().min(1), + onFailure: z.array(FailureCategorySchema).min(1), +}); +export type EmbeddingStrategyRef = z.infer; + +export const EmbeddingStrategySchema = z.object({ + id: z.string().min(1), + provider: z.enum(['openai', 'anthropic', 'google']), + model: z.string().min(1), + fallback: z.array(EmbeddingStrategyRefSchema).default([]), +}); +export type EmbeddingStrategy = z.infer & { + provider: ProviderName; +}; + +export const EmbeddingStrategyConfigSchema = z.object({ + version: z.literal('1.0'), + defaults: z.object({ + indexing: z.string().min(1), + query: z.string().min(1), + }), + strategies: z.array(EmbeddingStrategySchema).min(1), 
+}); +export type EmbeddingStrategyConfig = z.infer; + +export type EmbeddingStrategyValidationIssue = { + code: + | 'unknown_provider' + | 'missing_model' + | 'duplicate_strategy' + | 'unknown_fallback_strategy' + | 'cyclic_fallback_path' + | 'unknown_default_strategy'; + strategyId?: string; + detail: string; +}; + +export class EmbeddingStrategyConfigError extends Error { + constructor(public readonly issues: EmbeddingStrategyValidationIssue[]) { + super( + `Invalid embedding strategy configuration: ${issues.map((issue) => issue.detail).join('; ')}` + ); + this.name = 'EmbeddingStrategyConfigError'; + } +} + +export class EmbeddingStrategyResolutionError extends Error { + constructor( + public readonly strategyId: string, + public readonly reason: 'unknown_strategy' + ) { + super(`Unable to resolve embedding strategy "${strategyId}": ${reason}`); + this.name = 'EmbeddingStrategyResolutionError'; + } +} + +export function parseEmbeddingStrategyConfig(raw: unknown): { + config?: EmbeddingStrategyConfig; + issues: EmbeddingStrategyValidationIssue[]; +} { + const parsed = EmbeddingStrategyConfigSchema.safeParse(raw); + if (!parsed.success) { + return { + issues: parsed.error.issues.map((issue) => ({ + code: issue.path.includes('provider') ? 
'unknown_provider' : 'missing_model', + detail: `${issue.path.join('.') || ''}: ${issue.message}`, + })), + }; + } + + const config = parsed.data; + const issues: EmbeddingStrategyValidationIssue[] = []; + const byId = new Map(); + for (const strategy of config.strategies) { + if (byId.has(strategy.id)) { + issues.push({ + code: 'duplicate_strategy', + strategyId: strategy.id, + detail: `Duplicate strategy id "${strategy.id}"`, + }); + continue; + } + byId.set(strategy.id, strategy); + + if (!strategy.model.trim()) { + issues.push({ + code: 'missing_model', + strategyId: strategy.id, + detail: `Strategy "${strategy.id}" is missing model`, + }); + } + } + + for (const strategy of config.strategies) { + for (const fallback of strategy.fallback) { + if (!byId.has(fallback.strategyId)) { + issues.push({ + code: 'unknown_fallback_strategy', + strategyId: strategy.id, + detail: `Strategy "${strategy.id}" references unknown fallback strategy "${fallback.strategyId}"`, + }); + } + } + } + + if (!byId.has(config.defaults.indexing)) { + issues.push({ + code: 'unknown_default_strategy', + strategyId: config.defaults.indexing, + detail: `Default indexing strategy "${config.defaults.indexing}" does not exist`, + }); + } + if (!byId.has(config.defaults.query)) { + issues.push({ + code: 'unknown_default_strategy', + strategyId: config.defaults.query, + detail: `Default query strategy "${config.defaults.query}" does not exist`, + }); + } + + const visited = new Set(); + const inStack = new Set(); + const path: string[] = []; + + function walk(strategyId: string) { + if (inStack.has(strategyId)) { + const cycleStart = path.indexOf(strategyId); + const cycle = [...path.slice(cycleStart), strategyId].join(' -> '); + issues.push({ + code: 'cyclic_fallback_path', + strategyId, + detail: `Cyclic fallback path detected: ${cycle}`, + }); + return; + } + if (visited.has(strategyId)) { + return; + } + + visited.add(strategyId); + inStack.add(strategyId); + path.push(strategyId); + const 
strategy = byId.get(strategyId); + if (strategy) { + for (const fallback of strategy.fallback) { + if (byId.has(fallback.strategyId)) { + walk(fallback.strategyId); + } + } + } + path.pop(); + inStack.delete(strategyId); + } + + for (const strategy of config.strategies) { + walk(strategy.id); + } + + return issues.length > 0 ? { issues } : { config, issues: [] }; +} + +export function resolveEmbeddingStrategy( + config: EmbeddingStrategyConfig, + strategyId: string +): EmbeddingStrategy { + const strategy = config.strategies.find((entry) => entry.id === strategyId); + if (!strategy) { + throw new EmbeddingStrategyResolutionError(strategyId, 'unknown_strategy'); + } + return strategy as EmbeddingStrategy; +} diff --git a/src/context/indexer/full-index.ts b/src/context/indexer/full-index.ts index 1e1c76e..9bd2390 100644 --- a/src/context/indexer/full-index.ts +++ b/src/context/indexer/full-index.ts @@ -2,7 +2,14 @@ import { createHash, randomUUID } from 'node:crypto'; import { readFile } from 'node:fs/promises'; import fg from 'fast-glob'; import type { DubsbotDb } from '../../db/client'; +import { createProviderAdapter } from '../../providers'; import type { ProviderAdapter } from '../../providers/types'; +import { isEmbeddingStrategyV2Enabled, loadEmbeddingStrategyConfig } from '../embedding/config'; +import { + assertEmbeddingSuccess, + type EmbeddingProvenance, + executeEmbeddingWithStrategy, +} from '../embedding/engine'; import { deterministicEmbedding } from '../retrieval/rerank'; type Chunk = { @@ -57,6 +64,7 @@ export async function runFullIndex(input: { repoRoot: string; embedProvider?: ProviderAdapter; embeddingModel?: string; + embeddingStrategyId?: string; }): Promise<{ filesIndexed: number; chunksIndexed: number }> { const paths = await fg(['**/*', '!node_modules/**', '!.git/**', '!dist/**', '!coverage/**'], { cwd: input.repoRoot, @@ -68,6 +76,20 @@ export async function runFullIndex(input: { let filesIndexed = 0; let chunksIndexed = 0; + const 
isStrategyV2 = isEmbeddingStrategyV2Enabled(); + const strategyConfig = isStrategyV2 ? loadEmbeddingStrategyConfig() : null; + const adapterCache = new Map(); + + function getAdapter(provider: string): ProviderAdapter { + const cached = adapterCache.get(provider); + if (cached) { + return cached; + } + const adapter = createProviderAdapter(provider as 'openai' | 'anthropic' | 'google'); + adapterCache.set(provider, adapter); + return adapter; + } + for (const relativePath of paths) { const absolutePath = `${input.repoRoot}/${relativePath}`; const content = await readFile(absolutePath, 'utf8').catch(() => null); @@ -104,26 +126,58 @@ export async function runFullIndex(input: { [chunkId, persistedFileId, chunk.index, chunk.content, chunk.startLine, chunk.endLine] ); - const embedding = - input.embedProvider != null - ? ( - await input.embedProvider.embed({ - model: input.embeddingModel ?? 'text-embedding-3-small', - values: [chunk.content], - }) - )[0] - : deterministicEmbedding(chunk.content); + let embedding: number[]; + let provider = input.embedProvider ? 'remote' : 'local'; + let model = input.embeddingModel ?? 'deterministic-v1'; + let provenance: EmbeddingProvenance = { + strategyId: 'legacy-default', + attemptPath: [ + { + strategyId: 'legacy-default', + provider, + model, + status: 'success', + }, + ], + fallbackUsed: false, + resolvedBy: { + strategyId: 'legacy-default', + provider, + model, + }, + }; + + if (isStrategyV2 && strategyConfig) { + const strategyId = input.embeddingStrategyId ?? strategyConfig.defaults.indexing; + const result = await executeEmbeddingWithStrategy({ + config: strategyConfig, + strategyId, + value: chunk.content, + adapterForProvider: getAdapter, + }); + const success = assertEmbeddingSuccess(result); + embedding = success.embedding; + provider = success.provider; + model = success.model; + provenance = success.provenance; + emitEmbeddingTelemetry(success.provenance); + } else { + embedding = + input.embedProvider != null + ? 
( + await input.embedProvider.embed({ + model: input.embeddingModel ?? 'text-embedding-3-small', + values: [chunk.content], + }) + )[0] + : deterministicEmbedding(chunk.content); + } await input.db.query( - `INSERT INTO chunk_embeddings (chunk_id, provider, model, embedding) - VALUES ($1, $2, $3, $4::jsonb) - ON CONFLICT (chunk_id) DO UPDATE SET provider = EXCLUDED.provider, model = EXCLUDED.model, embedding = EXCLUDED.embedding`, - [ - chunkId, - input.embedProvider ? 'remote' : 'local', - input.embeddingModel ?? 'deterministic-v1', - JSON.stringify(embedding), - ] + `INSERT INTO chunk_embeddings (chunk_id, provider, model, embedding, provenance) + VALUES ($1, $2, $3, $4::jsonb, $5::jsonb) + ON CONFLICT (chunk_id) DO UPDATE SET provider = EXCLUDED.provider, model = EXCLUDED.model, embedding = EXCLUDED.embedding, provenance = EXCLUDED.provenance`, + [chunkId, provider, model, JSON.stringify(embedding), JSON.stringify(provenance)] ); await input.db.query('INSERT INTO bm25_documents (id, chunk_id, body) VALUES ($1, $2, $3)', [ @@ -136,3 +190,17 @@ export async function runFullIndex(input: { return { filesIndexed, chunksIndexed }; } + +function emitEmbeddingTelemetry(provenance: EmbeddingProvenance): void { + if (process.env.DUBSBOT_EMBEDDING_PROVENANCE_LOG !== '1') { + return; + } + const resolved = provenance.resolvedBy + ? 
`${provenance.resolvedBy.provider}:${provenance.resolvedBy.model}` + : 'none'; + console.info( + `[embedding] strategy=${provenance.strategyId} resolved=${resolved} fallback=${provenance.fallbackUsed} attempts=${provenance.attemptPath + .map((attempt) => `${attempt.provider}:${attempt.model}:${attempt.status}`) + .join('>')}` + ); +} diff --git a/src/context/retrieval/hybrid.ts b/src/context/retrieval/hybrid.ts index f052d21..1ea6c75 100644 --- a/src/context/retrieval/hybrid.ts +++ b/src/context/retrieval/hybrid.ts @@ -9,6 +9,9 @@ type ChunkRow = { content: string; path: string; embedding: string | null; + provider: string | null; + model: string | null; + provenance: string | null; }; async function grepSearch( @@ -61,7 +64,7 @@ export async function runHybridRetrieval(input: { const queryVector = deterministicEmbedding(query.vectorQuery || query.lexicalQuery); const rows = await input.db.query( - `SELECT c.id, c.content, f.path, ce.embedding::text as embedding + `SELECT c.id, c.content, f.path, ce.embedding::text as embedding, ce.provider, ce.model, ce.provenance::text as provenance FROM chunks c JOIN files f ON f.id = c.file_id LEFT JOIN chunk_embeddings ce ON ce.chunk_id = c.id @@ -104,6 +107,9 @@ export async function runHybridRetrieval(input: { score: entry.totalScore, metadata: { path: entry.item.path, + provider: entry.item.provider ?? 'unknown', + model: entry.item.model ?? 'unknown', + embeddingProvenance: entry.item.provenance ? 
JSON.parse(entry.item.provenance) : null, lexicalScore: entry.lexicalScore, vectorScore: entry.vectorScore, rank: index + 1, diff --git a/src/daemon/main.ts b/src/daemon/main.ts index 01c535b..33e226b 100644 --- a/src/daemon/main.ts +++ b/src/daemon/main.ts @@ -3,6 +3,7 @@ import { EventHookRunner } from '../automation/event-hooks'; import { AutomationRunner } from '../automation/runner'; import { AutomationScheduler } from '../automation/scheduler'; import { loadAgentsConfig } from '../config/agents-loader'; +import { loadEmbeddingStrategyConfig } from '../context/embedding/config'; import { RepoFsWatcher } from '../context/fs-watcher'; import { GitWatcher } from '../context/git-watcher'; import { runIncrementalIndex } from '../context/indexer/incremental'; @@ -15,6 +16,7 @@ import { createProviderAdapter, detectProvider } from '../providers'; async function main(): Promise { await runMigrations(); const db = await createDb(); + loadEmbeddingStrategyConfig(); const provider = createProviderAdapter(detectProvider()); const policy = new DefaultPolicyEngine(createDefaultApprovalPolicy()); const orchestrator = new AgentOrchestrator({ provider, policyEngine: policy }); diff --git a/src/db/migrate.ts b/src/db/migrate.ts index 397be47..e58a85c 100644 --- a/src/db/migrate.ts +++ b/src/db/migrate.ts @@ -1,5 +1,6 @@ import { readFile } from 'node:fs/promises'; import { join } from 'node:path'; +import fg from 'fast-glob'; import { createDb } from './client'; export async function runMigrations(): Promise { @@ -8,17 +9,28 @@ export async function runMigrations(): Promise { 'CREATE TABLE IF NOT EXISTS schema_migrations (version TEXT PRIMARY KEY, applied_at TIMESTAMPTZ NOT NULL DEFAULT NOW());' ); - const migrationPath = join(process.cwd(), 'src', 'db', 'migrations', '0001_init.sql'); - const migrationSql = await readFile(migrationPath, 'utf8'); + const migrationFiles = await fg(['*.sql'], { + cwd: join(process.cwd(), 'src', 'db', 'migrations'), + onlyFiles: true, + absolute: 
false, + }); + migrationFiles.sort(); - const already = await db.query<{ exists: boolean }>( - "SELECT EXISTS (SELECT 1 FROM schema_migrations WHERE version = '0001_init') AS exists" + const existing = await db.query<{ version: string }>( + 'SELECT version FROM schema_migrations ORDER BY version ASC' ); + const applied = new Set(existing.rows.map((row) => row.version)); - if (already.rows[0]?.exists) { - return; - } + for (const file of migrationFiles) { + const version = file.replace(/\.sql$/, ''); + if (applied.has(version)) { + continue; + } + + const migrationPath = join(process.cwd(), 'src', 'db', 'migrations', file); + const migrationSql = await readFile(migrationPath, 'utf8'); - await db.exec(migrationSql); - await db.query("INSERT INTO schema_migrations (version) VALUES ('0001_init')"); + await db.exec(migrationSql); + await db.query('INSERT INTO schema_migrations (version) VALUES ($1)', [version]); + } } diff --git a/src/db/migrations/0002_embedding_provenance.sql b/src/db/migrations/0002_embedding_provenance.sql new file mode 100644 index 0000000..a49cddd --- /dev/null +++ b/src/db/migrations/0002_embedding_provenance.sql @@ -0,0 +1,3 @@ +ALTER TABLE chunk_embeddings +ADD COLUMN IF NOT EXISTS provenance JSONB NOT NULL DEFAULT '{}'::jsonb; + diff --git a/tests/embedding-strategy.test.ts b/tests/embedding-strategy.test.ts new file mode 100644 index 0000000..852a3d2 --- /dev/null +++ b/tests/embedding-strategy.test.ts @@ -0,0 +1,259 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { loadEmbeddingStrategyConfig } from '../src/context/embedding/config'; +import { executeEmbeddingWithStrategy } from '../src/context/embedding/engine'; +import { + EmbeddingStrategyConfigError, + EmbeddingStrategyResolutionError, + parseEmbeddingStrategyConfig, + resolveEmbeddingStrategy, +} from '../src/context/embedding/strategy'; +import type { ProviderAdapter } from '../src/providers/types'; + +class FakeProvider implements ProviderAdapter { + 
constructor( + private readonly behavior: 'ok' | 'rate_limit' | 'auth' | 'timeout' | 'service_unavailable' + ) {} + + async generateStructured(): Promise { + throw new Error('not used'); + } + + async *streamStructured(): AsyncIterable {} + + async embed(): Promise { + if (this.behavior === 'ok') { + return [[0.1, 0.2, 0.3]]; + } + if (this.behavior === 'rate_limit') { + throw new Error('429 rate limit'); + } + if (this.behavior === 'auth') { + throw new Error('401 auth'); + } + if (this.behavior === 'timeout') { + throw new Error('timeout'); + } + throw new Error('503 unavailable'); + } + + async countTokens(): Promise { + return 1; + } + + supports(): boolean { + return true; + } +} + +const baseConfig = { + version: '1.0', + defaults: { indexing: 'indexing', query: 'query' }, + strategies: [ + { + id: 'indexing', + provider: 'anthropic', + model: 'claude-embed', + fallback: [{ strategyId: 'fallback-openai', onFailure: ['rate_limit', 'timeout'] }], + }, + { + id: 'query', + provider: 'anthropic', + model: 'claude-embed', + fallback: [{ strategyId: 'fallback-openai', onFailure: ['rate_limit'] }], + }, + { + id: 'fallback-openai', + provider: 'openai', + model: 'text-embedding-3-small', + fallback: [], + }, + ], +} as const; + +function requireConfig(raw: unknown) { + const parsed = parseEmbeddingStrategyConfig(raw); + if (!parsed.config) { + throw new Error( + `Expected valid config, received issues: ${parsed.issues.map((i) => i.detail).join('; ')}` + ); + } + return parsed.config; +} + +describe('embedding strategy configuration', () => { + beforeEach(() => { + delete process.env.DUBSBOT_EMBEDDING_STRATEGY_CONFIG_JSON; + }); + + it('loads valid strategy config and resolves known strategy ids', () => { + const config = requireConfig(baseConfig); + expect(resolveEmbeddingStrategy(config, 'indexing').provider).toBe('anthropic'); + }); + + it('rejects invalid config entries (unknown provider, missing fallback strategy, cycles)', () => { + const parsed = 
parseEmbeddingStrategyConfig({ + version: '1.0', + defaults: { indexing: 'a', query: 'b' }, + strategies: [ + { + id: 'a', + provider: 'openai', + model: 'x', + fallback: [{ strategyId: 'b', onFailure: ['rate_limit'] }], + }, + { + id: 'b', + provider: 'google', + model: 'x', + fallback: [{ strategyId: 'a', onFailure: ['rate_limit'] }], + }, + { + id: 'bad', + provider: 'openai', + model: 'x', + fallback: [{ strategyId: 'missing', onFailure: ['rate_limit'] }], + }, + ], + }); + + expect(parsed.config).toBeUndefined(); + expect(parsed.issues.some((issue) => issue.code === 'cyclic_fallback_path')).toBe(true); + expect(parsed.issues.some((issue) => issue.code === 'unknown_fallback_strategy')).toBe(true); + }); + + it('throws structured config error at startup when env config is invalid', () => { + process.env.DUBSBOT_EMBEDDING_STRATEGY_CONFIG_JSON = JSON.stringify({ + version: '1.0', + defaults: { indexing: 'missing', query: 'missing' }, + strategies: [{ id: 'only', provider: 'openai', model: 'x', fallback: [] }], + }); + + expect(() => loadEmbeddingStrategyConfig()).toThrow(EmbeddingStrategyConfigError); + }); + + it('throws structured runtime error when strategy id is unknown', () => { + const config = requireConfig(baseConfig); + expect(() => resolveEmbeddingStrategy(config, 'not-found')).toThrow( + EmbeddingStrategyResolutionError + ); + }); +}); + +describe('anthropic native-first fallback policy', () => { + it('returns anthropic result directly on success', async () => { + const config = requireConfig(baseConfig); + const result = await executeEmbeddingWithStrategy({ + config, + strategyId: 'indexing', + value: 'hello', + adapterForProvider: (provider) => + provider === 'anthropic' ? 
new FakeProvider('ok') : new FakeProvider('ok'), + }); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.provenance.fallbackUsed).toBe(false); + expect(result.provenance.attemptPath).toHaveLength(1); + expect(result.provenance.resolvedBy?.provider).toBe('anthropic'); + } + }); + + it('does not fallback on non-fallbackable anthropic failure', async () => { + const config = requireConfig(baseConfig); + const adapterSpy = vi.fn((provider: string) => + provider === 'anthropic' ? new FakeProvider('auth') : new FakeProvider('ok') + ); + + const result = await executeEmbeddingWithStrategy({ + config, + strategyId: 'indexing', + value: 'hello', + adapterForProvider: adapterSpy, + }); + + expect(result.ok).toBe(false); + expect(adapterSpy).toHaveBeenCalledTimes(1); + if (!result.ok) { + expect(result.provenance.failureCategory).toBe('auth'); + expect(result.provenance.terminalReason).toBe('fallback_disallowed'); + } + }); + + it('falls back in configured order for fallbackable errors', async () => { + const config = requireConfig(baseConfig); + const result = await executeEmbeddingWithStrategy({ + config, + strategyId: 'indexing', + value: 'hello', + adapterForProvider: (provider) => + provider === 'anthropic' ? 
new FakeProvider('rate_limit') : new FakeProvider('ok'), + }); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.provenance.fallbackUsed).toBe(true); + expect(result.provenance.attemptPath.map((entry) => entry.provider)).toEqual([ + 'anthropic', + 'openai', + ]); + } + }); + + it('returns terminal failure when no fallback is configured', async () => { + const config = requireConfig({ + version: '1.0', + defaults: { indexing: 'solo', query: 'solo' }, + strategies: [{ id: 'solo', provider: 'anthropic', model: 'claude-embed', fallback: [] }], + }); + const result = await executeEmbeddingWithStrategy({ + config, + strategyId: 'solo', + value: 'hello', + adapterForProvider: () => new FakeProvider('rate_limit'), + }); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.provenance.terminalReason).toBe('no_fallback'); + expect(result.provenance.attemptPath).toHaveLength(1); + } + }); +}); + +describe('embedding provenance completeness', () => { + it('includes complete provenance fields on success', async () => { + const config = requireConfig(baseConfig); + const result = await executeEmbeddingWithStrategy({ + config, + strategyId: 'indexing', + value: 'hello', + adapterForProvider: (provider) => + provider === 'anthropic' ? 
new FakeProvider('rate_limit') : new FakeProvider('ok'), + }); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.provenance.strategyId).toBe('indexing'); + expect(result.provenance.resolvedBy).toBeDefined(); + expect(result.provenance.attemptPath.length).toBeGreaterThan(0); + } + }); + + it('includes complete provenance fields on terminal failure', async () => { + const config = requireConfig(baseConfig); + const result = await executeEmbeddingWithStrategy({ + config, + strategyId: 'indexing', + value: 'hello', + adapterForProvider: () => new FakeProvider('timeout'), + }); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.provenance.strategyId).toBe('indexing'); + expect(result.provenance.attemptPath.length).toBeGreaterThan(0); + expect(result.provenance.failureCategory).toBeDefined(); + expect(result.provenance.terminalReason).toBeDefined(); + } + }); +}); From ee0110e128457996b398acea0db00c845bb746fc Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 19:17:53 -0800 Subject: [PATCH 3/3] fix(embedding): address PR review feedback for strategy rollout --- .../spec.md | 5 ++-- .../embedding-strategy-configuration/spec.md | 5 ++-- src/context/embedding/config.ts | 22 +++++++++++++--- src/context/embedding/engine.ts | 26 ++++++++++++++++--- src/context/embedding/strategy.ts | 9 +++++-- src/context/retrieval/hybrid.ts | 13 +++++++++- src/daemon/main.ts | 9 +++++-- src/db/migrate.ts | 15 +++++++++-- 8 files changed, 87 insertions(+), 17 deletions(-) diff --git a/openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md b/openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md index e34f9d8..1943817 100644 --- a/openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md +++ b/openspec/specs/anthropic-embedding-fallback-and-provenance/spec.md @@ -1,7 +1,9 @@ # anthropic-embedding-fallback-and-provenance Specification ## Purpose -TBD - created by archiving change embedding-parity-hardening. 
Update Purpose after archive. +Define expected Anthropic-primary embedding behavior, including native-first execution, +failure-category-gated fallback sequencing, and required provenance metadata for both successful +embedding results and terminal failures. ## Requirements ### Requirement: Anthropic Native-First Execution Policy For strategies configured with Anthropic as primary, the system SHALL attempt Anthropic native embedding first and SHALL only consider fallback providers explicitly listed in that strategy. @@ -35,4 +37,3 @@ The system SHALL attach provenance metadata to every embedding result and termin #### Scenario: Provenance is emitted on terminal failure - **WHEN** all attempts fail or fallback is disallowed - **THEN** the error payload includes provenance fields for attempted providers/models, failure category, and terminal resolution reason - diff --git a/openspec/specs/embedding-strategy-configuration/spec.md b/openspec/specs/embedding-strategy-configuration/spec.md index 3d34504..78a2c89 100644 --- a/openspec/specs/embedding-strategy-configuration/spec.md +++ b/openspec/specs/embedding-strategy-configuration/spec.md @@ -1,7 +1,9 @@ # embedding-strategy-configuration Specification ## Purpose -TBD - created by archiving change embedding-parity-hardening. Update Purpose after archive. +Define how embedding strategies are configured and resolved across providers and models, +including named strategy IDs, primary provider/model selection, ordered fallback chains, and +deterministic runtime resolution with startup validation of invalid or inconsistent configurations. ## Requirements ### Requirement: Provider-Configurable Embedding Strategy The system SHALL support explicit embedding strategy configuration per embedding use-case, including primary provider/model selection and an ordered fallback list. 
@@ -24,4 +26,3 @@ The system SHALL resolve embedding strategies deterministically for each request #### Scenario: Unknown strategy id is rejected - **WHEN** an embedding request specifies a strategy id not present in configuration - **THEN** the system returns a structured error and MUST NOT attempt embedding generation - diff --git a/src/context/embedding/config.ts b/src/context/embedding/config.ts index 83b713e..4d211c6 100644 --- a/src/context/embedding/config.ts +++ b/src/context/embedding/config.ts @@ -7,7 +7,7 @@ import { export function loadEmbeddingStrategyConfig(): EmbeddingStrategyConfig { const rawFromEnv = process.env.DUBSBOT_EMBEDDING_STRATEGY_CONFIG_JSON; - const raw = rawFromEnv ? JSON.parse(rawFromEnv) : buildLegacyDefaultConfig(); + const raw = rawFromEnv ? parseJsonConfigFromEnv(rawFromEnv) : buildLegacyDefaultConfig(); const parsed = parseEmbeddingStrategyConfig(raw); if (!parsed.config) { throw new EmbeddingStrategyConfigError(parsed.issues); @@ -70,8 +70,24 @@ function defaultEmbeddingModel(provider: ProviderName): string { case 'google': return process.env.DUBSBOT_GOOGLE_EMBEDDING_MODEL ?? 'text-embedding-004'; case 'anthropic': - return process.env.DUBSBOT_ANTHROPIC_EMBEDDING_MODEL ?? 'deterministic-v1'; + return process.env.DUBSBOT_ANTHROPIC_EMBEDDING_MODEL ?? 
'local-deterministic'; default: - return 'deterministic-v1'; + return 'local-deterministic'; + } +} + +function parseJsonConfigFromEnv(rawFromEnv: string): unknown { + try { + return JSON.parse(rawFromEnv); + } catch (error) { + if (error instanceof SyntaxError) { + throw new EmbeddingStrategyConfigError([ + { + code: 'schema_invalid', + detail: `DUBSBOT_EMBEDDING_STRATEGY_CONFIG_JSON is invalid JSON: ${error.message}`, + }, + ]); + } + throw error; } } diff --git a/src/context/embedding/engine.ts b/src/context/embedding/engine.ts index 9159281..ec65453 100644 --- a/src/context/embedding/engine.ts +++ b/src/context/embedding/engine.ts @@ -107,10 +107,10 @@ export async function executeEmbeddingWithStrategy(input: { failureCategory, }); - const eligibleFallback = strategy.fallback.find((entry) => + const eligibleFallbacks = strategy.fallback.filter((entry) => entry.onFailure.includes(failureCategory) ); - if (!eligibleFallback) { + if (eligibleFallbacks.length === 0) { return { ok: false, message: `Embedding failed for strategy "${currentId}" with category "${failureCategory}" and no eligible fallback.`, @@ -124,8 +124,28 @@ export async function executeEmbeddingWithStrategy(input: { }; } + const fallbackIds = eligibleFallbacks + .map((entry) => entry.strategyId) + .filter((strategyId) => !visited.has(strategyId) && !queue.includes(strategyId)); + + if (fallbackIds.length === 0) { + return { + ok: false, + message: `Embedding failed for strategy "${currentId}" with category "${failureCategory}" and exhausted fallback chain.`, + provenance: { + strategyId: input.strategyId, + attemptPath, + fallbackUsed: true, + failureCategory, + terminalReason: 'fallback_exhausted', + }, + }; + } + fallbackUsed = true; - queue.push(eligibleFallback.strategyId); + for (const fallbackId of fallbackIds) { + queue.push(fallbackId); + } } } diff --git a/src/context/embedding/strategy.ts b/src/context/embedding/strategy.ts index f3eac27..503fe15 100644 --- 
a/src/context/embedding/strategy.ts +++ b/src/context/embedding/strategy.ts @@ -44,7 +44,8 @@ export type EmbeddingStrategyValidationIssue = { | 'duplicate_strategy' | 'unknown_fallback_strategy' | 'cyclic_fallback_path' - | 'unknown_default_strategy'; + | 'unknown_default_strategy' + | 'schema_invalid'; strategyId?: string; detail: string; }; @@ -76,7 +77,11 @@ export function parseEmbeddingStrategyConfig(raw: unknown): { if (!parsed.success) { return { issues: parsed.error.issues.map((issue) => ({ - code: issue.path.includes('provider') ? 'unknown_provider' : 'missing_model', + code: issue.path.includes('provider') + ? 'unknown_provider' + : issue.path.includes('model') + ? 'missing_model' + : 'schema_invalid', detail: `${issue.path.join('.') || ''}: ${issue.message}`, })), }; diff --git a/src/context/retrieval/hybrid.ts b/src/context/retrieval/hybrid.ts index 1ea6c75..f1a2efc 100644 --- a/src/context/retrieval/hybrid.ts +++ b/src/context/retrieval/hybrid.ts @@ -109,7 +109,7 @@ export async function runHybridRetrieval(input: { path: entry.item.path, provider: entry.item.provider ?? 'unknown', model: entry.item.model ?? 'unknown', - embeddingProvenance: entry.item.provenance ? 
JSON.parse(entry.item.provenance) : null, + embeddingProvenance: safeJsonParse(entry.item.provenance), lexicalScore: entry.lexicalScore, vectorScore: entry.vectorScore, rank: index + 1, @@ -145,3 +145,14 @@ export async function runHybridRetrieval(input: { return bundle; } + +function safeJsonParse(value: string | null): unknown { + if (!value) { + return null; + } + try { + return JSON.parse(value); + } catch { + return null; + } +} diff --git a/src/daemon/main.ts b/src/daemon/main.ts index 33e226b..4edf875 100644 --- a/src/daemon/main.ts +++ b/src/daemon/main.ts @@ -3,7 +3,10 @@ import { EventHookRunner } from '../automation/event-hooks'; import { AutomationRunner } from '../automation/runner'; import { AutomationScheduler } from '../automation/scheduler'; import { loadAgentsConfig } from '../config/agents-loader'; -import { loadEmbeddingStrategyConfig } from '../context/embedding/config'; +import { + isEmbeddingStrategyV2Enabled, + loadEmbeddingStrategyConfig, +} from '../context/embedding/config'; import { RepoFsWatcher } from '../context/fs-watcher'; import { GitWatcher } from '../context/git-watcher'; import { runIncrementalIndex } from '../context/indexer/incremental'; @@ -16,7 +19,9 @@ import { createProviderAdapter, detectProvider } from '../providers'; async function main(): Promise { await runMigrations(); const db = await createDb(); - loadEmbeddingStrategyConfig(); + if (isEmbeddingStrategyV2Enabled()) { + loadEmbeddingStrategyConfig(); + } const provider = createProviderAdapter(detectProvider()); const policy = new DefaultPolicyEngine(createDefaultApprovalPolicy()); const orchestrator = new AgentOrchestrator({ provider, policyEngine: policy }); diff --git a/src/db/migrate.ts b/src/db/migrate.ts index e58a85c..3260914 100644 --- a/src/db/migrate.ts +++ b/src/db/migrate.ts @@ -30,7 +30,18 @@ export async function runMigrations(): Promise { const migrationPath = join(process.cwd(), 'src', 'db', 'migrations', file); const migrationSql = await 
readFile(migrationPath, 'utf8'); - await db.exec(migrationSql); - await db.query('INSERT INTO schema_migrations (version) VALUES ($1)', [version]); + try { + await db.exec('BEGIN'); + await db.exec(migrationSql); + await db.query('INSERT INTO schema_migrations (version) VALUES ($1)', [version]); + await db.exec('COMMIT'); + } catch (error) { + try { + await db.exec('ROLLBACK'); + } catch { + // Ignore rollback failures to avoid masking original migration errors. + } + throw error; + } } }