From 359b4b557124b810e806cca2d78f3b2afd61923f Mon Sep 17 00:00:00 2001 From: Danh Doan Date: Wed, 22 Apr 2026 16:34:22 +0700 Subject: [PATCH 1/3] feat: [ENG-2325] AutoHarness V2 baseline runner (dual-arm replay) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 7 Task 7.5 — Tier 1 Q1 brutal-review item. Ships the reusable `HarnessBaselineRunner` that powers the future `brv harness baseline` CLI command: replays the last N stored scenarios against two arms (pass-through template = raw, current version = harness), reports per-scenario outcomes + overall rates + delta. Complementary to the reference KPI harness (`scripts/autoharness-kpi/`, PR #523): - KPI harness: fixed task set + fixed model → release-notes headline number. - Baseline runner: user's own scenarios + current harness → per-user "is it working for me?" signal. The oclif command file (`src/oclif/commands/harness/baseline.ts`) is deliberately NOT shipped in this PR. The command needs the daemon-transport wiring for harness subcommands that Task 7.1 (status/inspect) establishes — none of Phase 7's CLI surface is merged yet, so there's no transport pattern to follow. Shipping just the runner class now (pure, testable, DI-ready) lets the oclif wrapper land in a small follow-up after 7.1 defines the pattern. This matches the "reusable logic now, wiring later" pattern used by the KPI harness scaffolding PR (#523). - Pure orchestration; caller injects the `HarnessToolsFactory` (production wires this to `SandboxService.buildHarnessTools({dryRun: true})` for write-blocked eval runs, matching Phase 6.1). - Raw arm fabricates a synthetic version from the stored template (`getTemplate(commandType, currentVersion.projectType).code`) — no need to persist it; the module builder accepts any code string. - `v1.0 scope narrowing`: only `commandType === 'curate'` is supported (query / chat throw `UNSUPPORTED_COMMAND_TYPE`) until query templates ship in the Phase 4.3 follow-up. 
- `count` bounded [1, 50]; scenarios < 3 fails with a clear "run curate N more times first" message so users know how to unblock themselves. - Each scenario runs once per arm (single-run side-by-side display). Contrast Phase 6.1 Evaluator which does 10 runs per scenario for statistical significance — different semantics, different surface, not shared on purpose. Tests (9): COUNT_OUT_OF_RANGE (0, 51), UNSUPPORTED_COMMAND_TYPE (query, chat), INSUFFICIENT_SCENARIOS (2 scenarios), NO_CURRENT_VERSION (empty store), happy-path delta = +100%, no-op delta = 0, count-caps-slice, harness-throws-captures- stderr. All 278 harness unit tests pass. Build + lint + typecheck clean. --- package-lock.json | 15 +- .../infra/harness/harness-baseline-runner.ts | 228 ++++++++++++ .../harness/harness-baseline-runner.test.ts | 342 ++++++++++++++++++ 3 files changed, 583 insertions(+), 2 deletions(-) create mode 100644 src/agent/infra/harness/harness-baseline-runner.ts create mode 100644 test/unit/agent/harness/harness-baseline-runner.test.ts diff --git a/package-lock.json b/package-lock.json index 340bc1518..c120c291a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "byterover-cli", - "version": "3.7.1", + "version": "3.8.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "byterover-cli", - "version": "3.7.1", + "version": "3.8.2", "bundleDependencies": [ "@campfirein/brv-transport-client", "@campfirein/byterover-packages" @@ -5889,6 +5889,7 @@ "os": [ "android" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -5909,6 +5910,7 @@ "os": [ "darwin" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -5929,6 +5931,7 @@ "os": [ "darwin" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -5949,6 +5952,7 @@ "os": [ "linux" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -5969,6 +5973,7 @@ "os": [ "linux" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -5989,6 +5994,7 @@ "os": [ "linux" ], + "peer": true, "engines": { 
"node": ">= 10" }, @@ -6009,6 +6015,7 @@ "os": [ "linux" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -6029,6 +6036,7 @@ "os": [ "linux" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -6049,6 +6057,7 @@ "os": [ "linux" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -6069,6 +6078,7 @@ "os": [ "win32" ], + "peer": true, "engines": { "node": ">= 10" }, @@ -6089,6 +6099,7 @@ "os": [ "win32" ], + "peer": true, "engines": { "node": ">= 10" }, diff --git a/src/agent/infra/harness/harness-baseline-runner.ts b/src/agent/infra/harness/harness-baseline-runner.ts new file mode 100644 index 000000000..f43f18de5 --- /dev/null +++ b/src/agent/infra/harness/harness-baseline-runner.ts @@ -0,0 +1,228 @@ +/** + * AutoHarness V2 — HarnessBaselineRunner. + * + * Powers `brv harness baseline` (Tier 1 Q1 brutal-review item). + * Replays the last N stored scenarios through two arms: + * - Raw: the Option C pass-through template for the current + * `projectType` + * - Harness: the pair's current (latest) stored version + * + * Each scenario runs once per arm. Success = harness function + * invocation completed without throwing. Returns per-scenario + * outcomes + overall rates + `delta = harness - raw`. + * + * Complementary to the reference KPI harness (`scripts/autoharness-kpi/`): + * - That runs a FIXED task set on a FIXED model for the release- + * notes headline. + * - This runs the USER's scenarios against the USER's current + * harness for the personal "is it working for me?" signal. + * + * `dryRun` enforcement is the caller's responsibility — production + * wires `HarnessToolsFactory` to `SandboxService.buildHarnessTools({dryRun: true})`. 
+ */ + +import type {HarnessContext, ProjectType, ValidatedEvaluationScenario} from '../../core/domain/harness/types.js' +import type {IHarnessStore} from '../../core/interfaces/i-harness-store.js' +import type {ILogger} from '../../core/interfaces/i-logger.js' +import type {HarnessToolsFactory} from './harness-evaluator.js' + +import {HarnessModuleBuilder} from './harness-module-builder.js' +import {getTemplate, type SupportedCommandType} from './templates/index.js' + +// ─── Public types ──────────────────────────────────────────────────────── + +export interface BaselineScenarioResult { + readonly harnessStderr?: string + readonly harnessSuccess: boolean + readonly rawStderr?: string + readonly rawSuccess: boolean + readonly scenarioId: string +} + +export interface BaselineReport { + readonly delta: number + readonly harnessSuccessRate: number + readonly perScenario: readonly BaselineScenarioResult[] + readonly rawSuccessRate: number + readonly scenarioCount: number +} + +export type BaselineRunnerErrorCode = + | 'COUNT_OUT_OF_RANGE' + | 'INSUFFICIENT_SCENARIOS' + | 'NO_CURRENT_VERSION' + | 'UNSUPPORTED_COMMAND_TYPE' + +export class HarnessBaselineRunnerError extends Error { + constructor( + message: string, + public readonly code: BaselineRunnerErrorCode, + public readonly details: Readonly> = {}, + ) { + super(message) + this.name = 'HarnessBaselineRunnerError' + } +} + +// ─── Constants ─────────────────────────────────────────────────────────── + +/** Smallest scenario count that produces a meaningful baseline number. */ +export const BASELINE_MIN_SCENARIOS = 3 + +/** Hard ceiling on `--count` — 1 run × 50 scenarios × 2 arms = 100 runs, already slow. */ +export const BASELINE_MAX_COUNT = 50 + +// ─── Runner ────────────────────────────────────────────────────────────── + +/** + * v1.0 scope narrowing (per Phase 4 Task 4.3): only `curate` templates + * exist, so baseline only supports `commandType === 'curate'` for now. 
+ * When query templates ship, widen this set. + */ +const SUPPORTED_BASELINE_COMMANDS: ReadonlySet = new Set(['curate']) + +export class HarnessBaselineRunner { + private readonly moduleBuilder: HarnessModuleBuilder + + constructor( + private readonly harnessStore: IHarnessStore, + private readonly logger: ILogger, + private readonly toolsFactory: HarnessToolsFactory, + ) { + this.moduleBuilder = new HarnessModuleBuilder(logger) + } + + async runBaseline(params: { + readonly commandType: 'chat' | 'curate' | 'query' + readonly count: number + readonly projectId: string + }): Promise { + const {commandType, count, projectId} = params + + if (count < 1 || count > BASELINE_MAX_COUNT) { + throw new HarnessBaselineRunnerError( + `--count must be in [1, ${BASELINE_MAX_COUNT}], got ${count}`, + 'COUNT_OUT_OF_RANGE', + {count, max: BASELINE_MAX_COUNT}, + ) + } + + if (!SUPPORTED_BASELINE_COMMANDS.has(commandType)) { + throw new HarnessBaselineRunnerError( + `baseline is only supported for commandType 'curate' in v1.0 (got '${commandType}'). Query/chat templates land in a follow-up.`, + 'UNSUPPORTED_COMMAND_TYPE', + {commandType}, + ) + } + + const allScenarios = await this.harnessStore.listScenarios(projectId, commandType) + const scenarios = allScenarios.slice(0, count) + + if (scenarios.length < BASELINE_MIN_SCENARIOS) { + throw new HarnessBaselineRunnerError( + `not enough scenarios — baseline needs at least ${BASELINE_MIN_SCENARIOS}, found ${scenarios.length}. 
Run curate ${BASELINE_MIN_SCENARIOS - scenarios.length} more time(s) first.`, + 'INSUFFICIENT_SCENARIOS', + {found: scenarios.length, required: BASELINE_MIN_SCENARIOS}, + ) + } + + const currentVersion = await this.harnessStore.getLatest(projectId, commandType) + if (currentVersion === undefined) { + throw new HarnessBaselineRunnerError( + `no current harness version for (${projectId}, ${commandType}) — bootstrap first by running curate once.`, + 'NO_CURRENT_VERSION', + {commandType, projectId}, + ) + } + + const rawCode = getTemplate( + commandType as SupportedCommandType, + currentVersion.projectType, + ).code + + // Build both modules. Either failing to build is a bug-level + // error — the current version came from the store (previously + // validated), and the raw template is a shipped constant. + const rawBuild = this.moduleBuilder.build({ + ...currentVersion, + code: rawCode, + id: `${currentVersion.id}:raw`, + }) + if (!rawBuild.loaded) { + throw new Error(`raw template failed to build: ${rawBuild.reason}`) + } + + const harnessBuild = this.moduleBuilder.build(currentVersion) + if (!harnessBuild.loaded) { + throw new Error(`current version failed to build: ${harnessBuild.reason}`) + } + + // Run each scenario once per arm. Serial over scenarios to keep + // outcomes traceable in logs; concurrent per-arm inside each + // scenario would add ordering noise for tiny latency savings. 
+ const perScenario: BaselineScenarioResult[] = [] + for (const scenario of scenarios) { + // eslint-disable-next-line no-await-in-loop + const rawRun = await this.runSingleScenario(rawBuild.module, scenario) + // eslint-disable-next-line no-await-in-loop + const harnessRun = await this.runSingleScenario(harnessBuild.module, scenario) + perScenario.push({ + harnessStderr: harnessRun.stderr, + harnessSuccess: harnessRun.success, + rawStderr: rawRun.stderr, + rawSuccess: rawRun.success, + scenarioId: scenario.id, + }) + } + + const rawSuccesses = perScenario.filter((r) => r.rawSuccess).length + const harnessSuccesses = perScenario.filter((r) => r.harnessSuccess).length + const rawSuccessRate = rawSuccesses / perScenario.length + const harnessSuccessRate = harnessSuccesses / perScenario.length + + return { + delta: harnessSuccessRate - rawSuccessRate, + harnessSuccessRate, + perScenario, + rawSuccessRate, + scenarioCount: perScenario.length, + } + } + + /** + * Execute one scenario against one arm's module. Mirrors the + * logic in HarnessEvaluator.executeSingleRun (intentionally not + * shared — evaluator uses 10-run means; baseline uses single-run + * side-by-side, different semantics, different test surface). + */ + private async runSingleScenario( + module: import('../../core/domain/harness/types.js').HarnessModule, + scenario: ValidatedEvaluationScenario, + ): Promise<{stderr?: string; success: boolean}> { + const tools = this.toolsFactory() + const ctx: HarnessContext = { + abort: new AbortController().signal, + env: { + commandType: scenario.commandType as 'chat' | 'curate' | 'query', + projectType: scenario.projectType as ProjectType, + workingDirectory: '/baseline', + }, + tools, + } + + try { + const fn = scenario.commandType === 'query' ? 
module.query : module.curate + if (fn === undefined) { + return {stderr: `no ${scenario.commandType} function on module`, success: false} + } + + await fn(ctx) + return {success: true} + } catch (error) { + return { + stderr: error instanceof Error ? error.message : String(error), + success: false, + } + } + } +} diff --git a/test/unit/agent/harness/harness-baseline-runner.test.ts b/test/unit/agent/harness/harness-baseline-runner.test.ts new file mode 100644 index 000000000..fb7e792f4 --- /dev/null +++ b/test/unit/agent/harness/harness-baseline-runner.test.ts @@ -0,0 +1,342 @@ +import {expect} from 'chai' +import {createSandbox, type SinonSandbox, type SinonStub} from 'sinon' + +import type { + HarnessContextTools, + HarnessVersion, + ValidatedEvaluationScenario, +} from '../../../../src/agent/core/domain/harness/types.js' +import type {IHarnessStore} from '../../../../src/agent/core/interfaces/i-harness-store.js' + +import {NoOpLogger} from '../../../../src/agent/core/interfaces/i-logger.js' +import { + BASELINE_MAX_COUNT, + BASELINE_MIN_SCENARIOS, + HarnessBaselineRunner, + HarnessBaselineRunnerError, +} from '../../../../src/agent/infra/harness/harness-baseline-runner.js' + +// ─── Helpers ───────────────────────────────────────────────────────────── + +const PROJECT_ID = 'baseline-test' + +function makeVersion(overrides: Partial = {}): HarnessVersion { + const passThroughCode = ` + exports.meta = function() { + return { + capabilities: ['curate'], + commandType: 'curate', + projectPatterns: ['**/*'], + version: 1, + } + } + exports.curate = async function(ctx) { return ctx.tools.curate([]) } + ` + return { + code: passThroughCode, + commandType: 'curate', + createdAt: 1_700_000_000_000, + heuristic: 0.3, + id: 'v-baseline-test', + metadata: { + capabilities: ['curate'], + commandType: 'curate', + projectPatterns: ['**/*'], + version: 1, + }, + projectId: PROJECT_ID, + projectType: 'generic', + version: 1, + ...overrides, + } +} + +function makeScenario(id: 
string): ValidatedEvaluationScenario { + return { + code: 'test code', + commandType: 'curate', + createdAt: 1_700_000_000_000, + expectedBehavior: 'ok', + id, + projectId: PROJECT_ID, + projectType: 'generic', + taskDescription: 'test task', + } +} + +function makeStoreStub(sb: SinonSandbox): { + readonly getLatest: SinonStub + readonly listScenarios: SinonStub + readonly store: IHarnessStore +} { + const getLatest = sb.stub() + const listScenarios = sb.stub() + const store = { + deleteOutcomes: sb.stub(), + deleteScenario: sb.stub(), + getLatest, + getVersion: sb.stub(), + listOutcomes: sb.stub(), + listScenarios, + listVersions: sb.stub(), + pruneOldVersions: sb.stub(), + recordFeedback: sb.stub(), + saveOutcome: sb.stub(), + saveScenario: sb.stub(), + saveVersion: sb.stub(), + } satisfies IHarnessStore + + return {getLatest, listScenarios, store} +} + +/** + * Build a tools factory whose `curate` rejects/resolves per-arm. + * Discriminates arms by a side-channel counter: each factory call + * alternates. Tests use this to control per-arm outcomes. + */ +function makeTwoArmToolsFactory( + sb: SinonSandbox, + spec: {readonly harness: () => void; readonly raw: () => void}, +): () => HarnessContextTools { + // Call pattern: in `runBaseline`, for EACH scenario, raw arm is + // built first (first curate call), then harness arm (second). + // Factory is invoked once per single-scenario execution, so odd + // calls are raw and even calls are harness — track via counter. + let call = 0 + return () => { + call++ + const isRawArm = call % 2 === 1 // 1st, 3rd, 5th... 
are raw + const curate = sb.stub().callsFake(() => { + if (isRawArm) return spec.raw() + return spec.harness() + }) + const readFile = sb.stub().resolves() + return {curate, readFile} as unknown as HarnessContextTools + } +} + +// ─── Tests ──────────────────────────────────────────────────────────────── + +describe('HarnessBaselineRunner', () => { + let sb: SinonSandbox + + beforeEach(() => { + sb = createSandbox() + }) + + afterEach(() => { + sb.restore() + }) + + it('1. throws COUNT_OUT_OF_RANGE when count is 0', async () => { + const {store} = makeStoreStub(sb) + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () => + ({curate: sb.stub().resolves(), readFile: sb.stub().resolves()}) as unknown as HarnessContextTools, + ) + + let caught: unknown + try { + await runner.runBaseline({commandType: 'curate', count: 0, projectId: PROJECT_ID}) + } catch (error) { + caught = error + } + + expect(caught).to.be.instanceOf(HarnessBaselineRunnerError) + expect((caught as HarnessBaselineRunnerError).code).to.equal('COUNT_OUT_OF_RANGE') + }) + + it('2. throws COUNT_OUT_OF_RANGE when count exceeds BASELINE_MAX_COUNT', async () => { + const {store} = makeStoreStub(sb) + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () => + ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools, + ) + + let caught: unknown + try { + await runner.runBaseline({ + commandType: 'curate', + count: BASELINE_MAX_COUNT + 1, + projectId: PROJECT_ID, + }) + } catch (error) { + caught = error + } + + expect((caught as HarnessBaselineRunnerError).code).to.equal('COUNT_OUT_OF_RANGE') + }) + + it('3. 
throws UNSUPPORTED_COMMAND_TYPE for query / chat (v1.0 curate-only)', async () => { + const {store} = makeStoreStub(sb) + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () => + ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools, + ) + + for (const cmd of ['query', 'chat'] as const) { + let caught: unknown + try { + // eslint-disable-next-line no-await-in-loop + await runner.runBaseline({commandType: cmd, count: 10, projectId: PROJECT_ID}) + } catch (error) { + caught = error + } + + expect( + (caught as HarnessBaselineRunnerError).code, + `mismatch for commandType=${cmd}`, + ).to.equal('UNSUPPORTED_COMMAND_TYPE') + } + }) + + it('4. throws INSUFFICIENT_SCENARIOS when < 3 scenarios exist', async () => { + const {getLatest, listScenarios, store} = makeStoreStub(sb) + listScenarios.resolves([makeScenario('s1'), makeScenario('s2')]) + getLatest.resolves(makeVersion()) + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () => + ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools, + ) + + let caught: unknown + try { + await runner.runBaseline({commandType: 'curate', count: 10, projectId: PROJECT_ID}) + } catch (error) { + caught = error + } + + const err = caught as HarnessBaselineRunnerError + expect(err.code).to.equal('INSUFFICIENT_SCENARIOS') + expect(err.details.found).to.equal(2) + expect(err.details.required).to.equal(BASELINE_MIN_SCENARIOS) + }) + + it('5. 
throws NO_CURRENT_VERSION when the pair has no stored version', async () => { + const {getLatest, listScenarios, store} = makeStoreStub(sb) + listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')]) + getLatest.resolves() + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () => + ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools, + ) + + let caught: unknown + try { + await runner.runBaseline({commandType: 'curate', count: 10, projectId: PROJECT_ID}) + } catch (error) { + caught = error + } + + expect((caught as HarnessBaselineRunnerError).code).to.equal('NO_CURRENT_VERSION') + }) + + it('6. happy path: raw fails every scenario, harness succeeds → delta = 100%', async () => { + const {getLatest, listScenarios, store} = makeStoreStub(sb) + const scenarios = [ + makeScenario('s1'), + makeScenario('s2'), + makeScenario('s3'), + makeScenario('s4'), + ] + listScenarios.resolves(scenarios) + getLatest.resolves(makeVersion()) + + const factory = makeTwoArmToolsFactory(sb, { + harness: () => Promise.resolve(), + raw() { + throw new Error('raw arm failure') + }, + }) + + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory) + const report = await runner.runBaseline({ + commandType: 'curate', + count: 10, + projectId: PROJECT_ID, + }) + + expect(report.scenarioCount).to.equal(4) + expect(report.rawSuccessRate).to.equal(0) + expect(report.harnessSuccessRate).to.equal(1) + expect(report.delta).to.equal(1) + for (const result of report.perScenario) { + expect(result.rawSuccess).to.equal(false) + expect(result.rawStderr).to.match(/raw arm failure/) + expect(result.harnessSuccess).to.equal(true) + } + }) + + it('7. 
mixed path: both arms pass → delta = 0', async () => { + const {getLatest, listScenarios, store} = makeStoreStub(sb) + listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')]) + getLatest.resolves(makeVersion()) + + const factory = makeTwoArmToolsFactory(sb, { + harness: () => Promise.resolve(), + raw: () => Promise.resolve(), + }) + + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory) + const report = await runner.runBaseline({ + commandType: 'curate', + count: 3, + projectId: PROJECT_ID, + }) + + expect(report.rawSuccessRate).to.equal(1) + expect(report.harnessSuccessRate).to.equal(1) + expect(report.delta).to.equal(0) + }) + + it('8. count caps the scenarios slice', async () => { + const {getLatest, listScenarios, store} = makeStoreStub(sb) + // 10 scenarios in store; ask for 5 → only 5 run. + const scenarios = Array.from({length: 10}, (_, i) => makeScenario(`s${i}`)) + listScenarios.resolves(scenarios) + getLatest.resolves(makeVersion()) + + const factory = makeTwoArmToolsFactory(sb, { + harness: () => Promise.resolve(), + raw: () => Promise.resolve(), + }) + + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory) + const report = await runner.runBaseline({ + commandType: 'curate', + count: 5, + projectId: PROJECT_ID, + }) + + expect(report.scenarioCount).to.equal(5) + expect(report.perScenario.map((r) => r.scenarioId)).to.deep.equal([ + 's0', + 's1', + 's2', + 's3', + 's4', + ]) + }) + + it('9. 
harness run that throws is captured as failure with stderr', async () => { + const {getLatest, listScenarios, store} = makeStoreStub(sb) + listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')]) + getLatest.resolves(makeVersion()) + + const factory = makeTwoArmToolsFactory(sb, { + harness() { + throw new Error('harness cratered') + }, + raw: () => Promise.resolve(), + }) + + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory) + const report = await runner.runBaseline({ + commandType: 'curate', + count: 3, + projectId: PROJECT_ID, + }) + + expect(report.rawSuccessRate).to.equal(1) + expect(report.harnessSuccessRate).to.equal(0) + expect(report.delta).to.equal(-1) + for (const r of report.perScenario) { + expect(r.harnessStderr).to.match(/cratered/) + } + }) +}) From 8c5c0d1d61f88065acd894e5c6f48c90a2742871 Mon Sep 17 00:00:00 2001 From: Danh Doan Date: Wed, 22 Apr 2026 16:40:46 +0700 Subject: [PATCH 2/3] test: [ENG-2325] add deleteOutcome stub after rebase on proj Proj/autoharness-v2 picked up a new IHarnessStore.deleteOutcome method between branch-point and PR CI. Rebase picked up the interface change; this commit updates the makeStoreStub helper to satisfy it. 
--- test/unit/agent/harness/harness-baseline-runner.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unit/agent/harness/harness-baseline-runner.test.ts b/test/unit/agent/harness/harness-baseline-runner.test.ts index fb7e792f4..5fb59380d 100644 --- a/test/unit/agent/harness/harness-baseline-runner.test.ts +++ b/test/unit/agent/harness/harness-baseline-runner.test.ts @@ -72,6 +72,7 @@ function makeStoreStub(sb: SinonSandbox): { const getLatest = sb.stub() const listScenarios = sb.stub() const store = { + deleteOutcome: sb.stub(), deleteOutcomes: sb.stub(), deleteScenario: sb.stub(), getLatest, From 6d0b90db32d456735c718d7fb0b70601acec0308 Mon Sep 17 00:00:00 2001 From: Danh Doan Date: Wed, 22 Apr 2026 16:47:39 +0700 Subject: [PATCH 3/3] review: [ENG-2325] address PR 526 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix INSUFFICIENT_SCENARIOS to guard on store coverage, not the sliced window (caller passing a too-low --count now surfaces as a valid run, not a misleading "run curate N more times" error). Add test 4b. - Hoist HarnessModule to top-level import type block; drop inline dynamic type import. - Replace `as SupportedCommandType` cast with an isSupportedBaselineCommand type predicate so TypeScript narrows at the guard. - Swap ternary fn-lookup for an explicit fnMap — prevents silent fall-through if chat is ever added to the supported set. - Document the raw-first call-order coupling in makeTwoArmToolsFactory. 
--- .../infra/harness/harness-baseline-runner.ts | 40 +++++++++++-------- .../harness/harness-baseline-runner.test.ts | 29 ++++++++++++++ 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/src/agent/infra/harness/harness-baseline-runner.ts b/src/agent/infra/harness/harness-baseline-runner.ts index f43f18de5..37b22b5dd 100644 --- a/src/agent/infra/harness/harness-baseline-runner.ts +++ b/src/agent/infra/harness/harness-baseline-runner.ts @@ -21,7 +21,7 @@ * wires `HarnessToolsFactory` to `SandboxService.buildHarnessTools({dryRun: true})`. */ -import type {HarnessContext, ProjectType, ValidatedEvaluationScenario} from '../../core/domain/harness/types.js' +import type {HarnessContext, HarnessModule, ProjectType, ValidatedEvaluationScenario} from '../../core/domain/harness/types.js' import type {IHarnessStore} from '../../core/interfaces/i-harness-store.js' import type {ILogger} from '../../core/interfaces/i-logger.js' import type {HarnessToolsFactory} from './harness-evaluator.js' @@ -81,6 +81,10 @@ export const BASELINE_MAX_COUNT = 50 */ const SUPPORTED_BASELINE_COMMANDS: ReadonlySet = new Set(['curate']) +function isSupportedBaselineCommand(cmd: string): cmd is SupportedCommandType { + return SUPPORTED_BASELINE_COMMANDS.has(cmd) +} + export class HarnessBaselineRunner { private readonly moduleBuilder: HarnessModuleBuilder @@ -107,7 +111,7 @@ export class HarnessBaselineRunner { ) } - if (!SUPPORTED_BASELINE_COMMANDS.has(commandType)) { + if (!isSupportedBaselineCommand(commandType)) { throw new HarnessBaselineRunnerError( `baseline is only supported for commandType 'curate' in v1.0 (got '${commandType}'). 
Query/chat templates land in a follow-up.`, 'UNSUPPORTED_COMMAND_TYPE', @@ -116,16 +120,19 @@ export class HarnessBaselineRunner { } const allScenarios = await this.harnessStore.listScenarios(projectId, commandType) - const scenarios = allScenarios.slice(0, count) - if (scenarios.length < BASELINE_MIN_SCENARIOS) { + // Guard on store coverage, not the requested window: `--count 2` with + // 10 stored scenarios is a valid (if small) run, not missing data. + if (allScenarios.length < BASELINE_MIN_SCENARIOS) { throw new HarnessBaselineRunnerError( - `not enough scenarios — baseline needs at least ${BASELINE_MIN_SCENARIOS}, found ${scenarios.length}. Run curate ${BASELINE_MIN_SCENARIOS - scenarios.length} more time(s) first.`, + `not enough scenarios — baseline needs at least ${BASELINE_MIN_SCENARIOS}, found ${allScenarios.length}. Run curate ${BASELINE_MIN_SCENARIOS - allScenarios.length} more time(s) first.`, 'INSUFFICIENT_SCENARIOS', - {found: scenarios.length, required: BASELINE_MIN_SCENARIOS}, + {found: allScenarios.length, required: BASELINE_MIN_SCENARIOS}, ) } + const scenarios = allScenarios.slice(0, count) + const currentVersion = await this.harnessStore.getLatest(projectId, commandType) if (currentVersion === undefined) { throw new HarnessBaselineRunnerError( @@ -135,10 +142,7 @@ export class HarnessBaselineRunner { ) } - const rawCode = getTemplate( - commandType as SupportedCommandType, - currentVersion.projectType, - ).code + const rawCode = getTemplate(commandType, currentVersion.projectType).code // Build both modules. Either failing to build is a bug-level // error — the current version came from the store (previously @@ -196,7 +200,7 @@ export class HarnessBaselineRunner { * side-by-side, different semantics, different test surface). 
*/ private async runSingleScenario( - module: import('../../core/domain/harness/types.js').HarnessModule, + module: HarnessModule, scenario: ValidatedEvaluationScenario, ): Promise<{stderr?: string; success: boolean}> { const tools = this.toolsFactory() @@ -210,12 +214,16 @@ export class HarnessBaselineRunner { tools, } - try { - const fn = scenario.commandType === 'query' ? module.query : module.curate - if (fn === undefined) { - return {stderr: `no ${scenario.commandType} function on module`, success: false} - } + const fnMap: Partial Promise>> = { + curate: module.curate, + query: module.query, + } + const fn = fnMap[scenario.commandType] + if (fn === undefined) { + return {stderr: `no ${scenario.commandType} function on module`, success: false} + } + try { await fn(ctx) return {success: true} } catch (error) { diff --git a/test/unit/agent/harness/harness-baseline-runner.test.ts b/test/unit/agent/harness/harness-baseline-runner.test.ts index 5fb59380d..836fa8836 100644 --- a/test/unit/agent/harness/harness-baseline-runner.test.ts +++ b/test/unit/agent/harness/harness-baseline-runner.test.ts @@ -94,6 +94,13 @@ function makeStoreStub(sb: SinonSandbox): { * Build a tools factory whose `curate` rejects/resolves per-arm. * Discriminates arms by a side-channel counter: each factory call * alternates. Tests use this to control per-arm outcomes. + * + * IMPLEMENTATION COUPLING: this helper assumes `runBaseline` invokes + * the RAW arm before the HARNESS arm within each scenario (odd call = + * raw, even = harness). If that order is ever reversed, arm + * assertions invert silently. A more resilient scheme would tag the + * arm via `ctx.env` and discriminate on the tag; kept simple here + * because the single `runBaseline` caller is serial and deliberate. */ function makeTwoArmToolsFactory( sb: SinonSandbox, @@ -209,6 +216,28 @@ describe('HarnessBaselineRunner', () => { expect(err.details.required).to.equal(BASELINE_MIN_SCENARIOS) }) + it('4b. 
INSUFFICIENT_SCENARIOS reflects STORE coverage, not the sliced window', async () => { + // Store has plenty (10); caller passes --count=2. The guard must not + // fire on the sliced window: this is a valid request, not missing data. + const {getLatest, listScenarios, store} = makeStoreStub(sb) + listScenarios.resolves(Array.from({length: 10}, (_, i) => makeScenario(`s${i}`))) + getLatest.resolves(makeVersion()) + const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () => + ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools, + ) + + let caught: unknown + try { + await runner.runBaseline({commandType: 'curate', count: 2, projectId: PROJECT_ID}) + } catch (error) { + caught = error + } + + // With the fix, 10 stored ≥ 3, so guard does NOT fire — the run proceeds + // and completes on the 2-scenario slice (count itself is valid: [1, 50]). + expect(caught).to.equal(undefined) + }) + it('5. throws NO_CURRENT_VERSION when the pair has no stored version', async () => { const {getLatest, listScenarios, store} = makeStoreStub(sb) listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')])