From 359b4b557124b810e806cca2d78f3b2afd61923f Mon Sep 17 00:00:00 2001
From: Danh Doan <danhdoancv@gmail.com>
Date: Wed, 22 Apr 2026 16:34:22 +0700
Subject: [PATCH 1/3] feat: [ENG-2325] AutoHarness V2 baseline runner (dual-arm
 replay)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 7 Task 7.5 — Tier 1 Q1 brutal-review item. Ships the reusable
`HarnessBaselineRunner` that powers the future `brv harness baseline`
CLI command: replays the last N stored scenarios against two arms
(pass-through template = raw, current version = harness), reports
per-scenario outcomes + overall rates + delta.

Complementary to the reference KPI harness
(`scripts/autoharness-kpi/`, PR #523):
  - KPI harness: fixed task set + fixed model → release-notes
    headline number.
  - Baseline runner: user's own scenarios + current harness →
    per-user "is it working for me?" signal.

The oclif command file (`src/oclif/commands/harness/baseline.ts`)
is deliberately NOT shipped in this PR. The command needs the
daemon-transport wiring for harness subcommands that Task 7.1
(status/inspect) establishes — none of Phase 7's CLI surface is
merged yet, so there's no transport pattern to follow. Shipping
just the runner class now (pure, testable, DI-ready) lets the
oclif wrapper land in a small follow-up after 7.1 defines the
pattern.

This matches the "reusable logic now, wiring later" pattern used
by the KPI harness scaffolding PR (#523).

- Pure orchestration; caller injects the `HarnessToolsFactory`
  (production wires this to `SandboxService.buildHarnessTools({dryRun: true})`
  for write-blocked eval runs, matching Phase 6.1).
- Raw arm fabricates a synthetic version from the stored template
  (`getTemplate(commandType, currentVersion.projectType).code`) —
  no need to persist it; the module builder accepts any code
  string.
- `v1.0 scope narrowing`: only `commandType === 'curate'` is
  supported (query / chat throw `UNSUPPORTED_COMMAND_TYPE`) until
  query templates ship in the Phase 4.3 follow-up.
- `count` bounded [1, 50]; scenarios < 3 fails with a clear
  "run curate N more times first" message so users know how to
  unblock themselves.
- Each scenario runs once per arm (single-run side-by-side
  display). Contrast Phase 6.1 Evaluator which does 10 runs per
  scenario for statistical significance — different semantics,
  different surface, not shared on purpose.

Tests (9): COUNT_OUT_OF_RANGE (0, 51), UNSUPPORTED_COMMAND_TYPE
(query, chat), INSUFFICIENT_SCENARIOS (2 scenarios),
NO_CURRENT_VERSION (empty store), happy-path delta = +100%,
no-op delta = 0, count-caps-slice, harness-throws-captures-
stderr.

All 278 harness unit tests pass. Build + lint + typecheck clean.
---
 package-lock.json                             |  15 +-
 .../infra/harness/harness-baseline-runner.ts  | 228 ++++++++++++
 .../harness/harness-baseline-runner.test.ts   | 342 ++++++++++++++++++
 3 files changed, 583 insertions(+), 2 deletions(-)
 create mode 100644 src/agent/infra/harness/harness-baseline-runner.ts
 create mode 100644 test/unit/agent/harness/harness-baseline-runner.test.ts

diff --git a/package-lock.json b/package-lock.json
index 340bc1518..c120c291a 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "byterover-cli",
-  "version": "3.7.1",
+  "version": "3.8.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "byterover-cli",
-      "version": "3.7.1",
+      "version": "3.8.2",
       "bundleDependencies": [
         "@campfirein/brv-transport-client",
         "@campfirein/byterover-packages"
@@ -5889,6 +5889,7 @@
       "os": [
         "android"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -5909,6 +5910,7 @@
       "os": [
         "darwin"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -5929,6 +5931,7 @@
       "os": [
         "darwin"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -5949,6 +5952,7 @@
       "os": [
         "linux"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -5969,6 +5973,7 @@
       "os": [
         "linux"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -5989,6 +5994,7 @@
       "os": [
         "linux"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -6009,6 +6015,7 @@
       "os": [
         "linux"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -6029,6 +6036,7 @@
       "os": [
         "linux"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -6049,6 +6057,7 @@
       "os": [
         "linux"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -6069,6 +6078,7 @@
       "os": [
         "win32"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
@@ -6089,6 +6099,7 @@
       "os": [
         "win32"
       ],
+      "peer": true,
       "engines": {
         "node": ">= 10"
       },
diff --git a/src/agent/infra/harness/harness-baseline-runner.ts b/src/agent/infra/harness/harness-baseline-runner.ts
new file mode 100644
index 000000000..f43f18de5
--- /dev/null
+++ b/src/agent/infra/harness/harness-baseline-runner.ts
@@ -0,0 +1,228 @@
+/**
+ * AutoHarness V2 — HarnessBaselineRunner.
+ *
+ * Powers `brv harness baseline` (Tier 1 Q1 brutal-review item).
+ * Replays the last N stored scenarios through two arms:
+ *   - Raw: the Option C pass-through template for the current
+ *          `projectType`
+ *   - Harness: the pair's current (latest) stored version
+ *
+ * Each scenario runs once per arm. Success = harness function
+ * invocation completed without throwing. Returns per-scenario
+ * outcomes + overall rates + `delta = harness - raw`.
+ *
+ * Complementary to the reference KPI harness (`scripts/autoharness-kpi/`):
+ *   - That runs a FIXED task set on a FIXED model for the release-
+ *     notes headline.
+ *   - This runs the USER's scenarios against the USER's current
+ *     harness for the personal "is it working for me?" signal.
+ *
+ * `dryRun` enforcement is the caller's responsibility — production
+ * wires `HarnessToolsFactory` to `SandboxService.buildHarnessTools({dryRun: true})`.
+ */
+
+import type {HarnessContext, ProjectType, ValidatedEvaluationScenario} from '../../core/domain/harness/types.js'
+import type {IHarnessStore} from '../../core/interfaces/i-harness-store.js'
+import type {ILogger} from '../../core/interfaces/i-logger.js'
+import type {HarnessToolsFactory} from './harness-evaluator.js'
+
+import {HarnessModuleBuilder} from './harness-module-builder.js'
+import {getTemplate, type SupportedCommandType} from './templates/index.js'
+
+// ─── Public types ────────────────────────────────────────────────────────
+
+export interface BaselineScenarioResult {
+  readonly harnessStderr?: string
+  readonly harnessSuccess: boolean
+  readonly rawStderr?: string
+  readonly rawSuccess: boolean
+  readonly scenarioId: string
+}
+
+export interface BaselineReport {
+  readonly delta: number
+  readonly harnessSuccessRate: number
+  readonly perScenario: readonly BaselineScenarioResult[]
+  readonly rawSuccessRate: number
+  readonly scenarioCount: number
+}
+
+export type BaselineRunnerErrorCode =
+  | 'COUNT_OUT_OF_RANGE'
+  | 'INSUFFICIENT_SCENARIOS'
+  | 'NO_CURRENT_VERSION'
+  | 'UNSUPPORTED_COMMAND_TYPE'
+
+export class HarnessBaselineRunnerError extends Error {
+  constructor(
+    message: string,
+    public readonly code: BaselineRunnerErrorCode,
+    public readonly details: Readonly<Record<string, unknown>> = {},
+  ) {
+    super(message)
+    this.name = 'HarnessBaselineRunnerError'
+  }
+}
+
+// ─── Constants ───────────────────────────────────────────────────────────
+
+/** Smallest scenario count that produces a meaningful baseline number. */
+export const BASELINE_MIN_SCENARIOS = 3
+
+/** Hard ceiling on `--count` — 50 scenarios × 2 arms × 1 run each = 100 harness invocations, already slow. */
+export const BASELINE_MAX_COUNT = 50
+
+// ─── Runner ──────────────────────────────────────────────────────────────
+
+/**
+ * v1.0 scope narrowing (per Phase 4 Task 4.3): only `curate` templates
+ * exist, so baseline only supports `commandType === 'curate'` for now.
+ * When query templates ship, widen this set.
+ */
+const SUPPORTED_BASELINE_COMMANDS: ReadonlySet<string> = new Set<SupportedCommandType>(['curate'])
+
+export class HarnessBaselineRunner {
+  private readonly moduleBuilder: HarnessModuleBuilder
+
+  constructor(
+    private readonly harnessStore: IHarnessStore,
+    private readonly logger: ILogger,
+    private readonly toolsFactory: HarnessToolsFactory,
+  ) {
+    this.moduleBuilder = new HarnessModuleBuilder(logger)
+  }
+
+  async runBaseline(params: {
+    readonly commandType: 'chat' | 'curate' | 'query'
+    readonly count: number
+    readonly projectId: string
+  }): Promise<BaselineReport> {
+    const {commandType, count, projectId} = params
+
+    if (count < 1 || count > BASELINE_MAX_COUNT) {
+      throw new HarnessBaselineRunnerError(
+        `--count must be in [1, ${BASELINE_MAX_COUNT}], got ${count}`,
+        'COUNT_OUT_OF_RANGE',
+        {count, max: BASELINE_MAX_COUNT},
+      )
+    }
+
+    if (!SUPPORTED_BASELINE_COMMANDS.has(commandType)) {
+      throw new HarnessBaselineRunnerError(
+        `baseline is only supported for commandType 'curate' in v1.0 (got '${commandType}'). Query/chat templates land in a follow-up.`,
+        'UNSUPPORTED_COMMAND_TYPE',
+        {commandType},
+      )
+    }
+
+    const allScenarios = await this.harnessStore.listScenarios(projectId, commandType)
+    const scenarios = allScenarios.slice(0, count)
+
+    if (scenarios.length < BASELINE_MIN_SCENARIOS) {
+      throw new HarnessBaselineRunnerError(
+        `not enough scenarios — baseline needs at least ${BASELINE_MIN_SCENARIOS}, found ${scenarios.length}. Run curate ${BASELINE_MIN_SCENARIOS - scenarios.length} more time(s) first.`,
+        'INSUFFICIENT_SCENARIOS',
+        {found: scenarios.length, required: BASELINE_MIN_SCENARIOS},
+      )
+    }
+
+    const currentVersion = await this.harnessStore.getLatest(projectId, commandType)
+    if (currentVersion === undefined) {
+      throw new HarnessBaselineRunnerError(
+        `no current harness version for (${projectId}, ${commandType}) — bootstrap first by running curate once.`,
+        'NO_CURRENT_VERSION',
+        {commandType, projectId},
+      )
+    }
+
+    const rawCode = getTemplate(
+      commandType as SupportedCommandType,
+      currentVersion.projectType,
+    ).code
+
+    // Build both modules. Either failing to build is a bug-level
+    // error — the current version came from the store (previously
+    // validated), and the raw template is a shipped constant.
+    const rawBuild = this.moduleBuilder.build({
+      ...currentVersion,
+      code: rawCode,
+      id: `${currentVersion.id}:raw`,
+    })
+    if (!rawBuild.loaded) {
+      throw new Error(`raw template failed to build: ${rawBuild.reason}`)
+    }
+
+    const harnessBuild = this.moduleBuilder.build(currentVersion)
+    if (!harnessBuild.loaded) {
+      throw new Error(`current version failed to build: ${harnessBuild.reason}`)
+    }
+
+    // Run each scenario once per arm. Serial over scenarios to keep
+    // outcomes traceable in logs; concurrent per-arm inside each
+    // scenario would add ordering noise for tiny latency savings.
+    const perScenario: BaselineScenarioResult[] = []
+    for (const scenario of scenarios) {
+      // eslint-disable-next-line no-await-in-loop
+      const rawRun = await this.runSingleScenario(rawBuild.module, scenario)
+      // eslint-disable-next-line no-await-in-loop
+      const harnessRun = await this.runSingleScenario(harnessBuild.module, scenario)
+      perScenario.push({
+        harnessStderr: harnessRun.stderr,
+        harnessSuccess: harnessRun.success,
+        rawStderr: rawRun.stderr,
+        rawSuccess: rawRun.success,
+        scenarioId: scenario.id,
+      })
+    }
+
+    const rawSuccesses = perScenario.filter((r) => r.rawSuccess).length
+    const harnessSuccesses = perScenario.filter((r) => r.harnessSuccess).length
+    const rawSuccessRate = rawSuccesses / perScenario.length
+    const harnessSuccessRate = harnessSuccesses / perScenario.length
+
+    return {
+      delta: harnessSuccessRate - rawSuccessRate,
+      harnessSuccessRate,
+      perScenario,
+      rawSuccessRate,
+      scenarioCount: perScenario.length,
+    }
+  }
+
+  /**
+   * Execute one scenario against one arm's module. Mirrors the
+   * logic in HarnessEvaluator.executeSingleRun (intentionally not
+   * shared — evaluator uses 10-run means; baseline uses single-run
+   * side-by-side, different semantics, different test surface).
+   */
+  private async runSingleScenario(
+    module: import('../../core/domain/harness/types.js').HarnessModule,
+    scenario: ValidatedEvaluationScenario,
+  ): Promise<{stderr?: string; success: boolean}> {
+    const tools = this.toolsFactory()
+    const ctx: HarnessContext = {
+      abort: new AbortController().signal,
+      env: {
+        commandType: scenario.commandType as 'chat' | 'curate' | 'query',
+        projectType: scenario.projectType as ProjectType,
+        workingDirectory: '/baseline',
+      },
+      tools,
+    }
+
+    try {
+      const fn = scenario.commandType === 'query' ? module.query : module.curate
+      if (fn === undefined) {
+        return {stderr: `no ${scenario.commandType} function on module`, success: false}
+      }
+
+      await fn(ctx)
+      return {success: true}
+    } catch (error) {
+      return {
+        stderr: error instanceof Error ? error.message : String(error),
+        success: false,
+      }
+    }
+  }
+}
diff --git a/test/unit/agent/harness/harness-baseline-runner.test.ts b/test/unit/agent/harness/harness-baseline-runner.test.ts
new file mode 100644
index 000000000..fb7e792f4
--- /dev/null
+++ b/test/unit/agent/harness/harness-baseline-runner.test.ts
@@ -0,0 +1,342 @@
+import {expect} from 'chai'
+import {createSandbox, type SinonSandbox, type SinonStub} from 'sinon'
+
+import type {
+  HarnessContextTools,
+  HarnessVersion,
+  ValidatedEvaluationScenario,
+} from '../../../../src/agent/core/domain/harness/types.js'
+import type {IHarnessStore} from '../../../../src/agent/core/interfaces/i-harness-store.js'
+
+import {NoOpLogger} from '../../../../src/agent/core/interfaces/i-logger.js'
+import {
+  BASELINE_MAX_COUNT,
+  BASELINE_MIN_SCENARIOS,
+  HarnessBaselineRunner,
+  HarnessBaselineRunnerError,
+} from '../../../../src/agent/infra/harness/harness-baseline-runner.js'
+
+// ─── Helpers ─────────────────────────────────────────────────────────────
+
+const PROJECT_ID = 'baseline-test'
+
+function makeVersion(overrides: Partial<HarnessVersion> = {}): HarnessVersion {
+  const passThroughCode = `
+    exports.meta = function() {
+      return {
+        capabilities: ['curate'],
+        commandType: 'curate',
+        projectPatterns: ['**/*'],
+        version: 1,
+      }
+    }
+    exports.curate = async function(ctx) { return ctx.tools.curate([]) }
+  `
+  return {
+    code: passThroughCode,
+    commandType: 'curate',
+    createdAt: 1_700_000_000_000,
+    heuristic: 0.3,
+    id: 'v-baseline-test',
+    metadata: {
+      capabilities: ['curate'],
+      commandType: 'curate',
+      projectPatterns: ['**/*'],
+      version: 1,
+    },
+    projectId: PROJECT_ID,
+    projectType: 'generic',
+    version: 1,
+    ...overrides,
+  }
+}
+
+function makeScenario(id: string): ValidatedEvaluationScenario {
+  return {
+    code: 'test code',
+    commandType: 'curate',
+    createdAt: 1_700_000_000_000,
+    expectedBehavior: 'ok',
+    id,
+    projectId: PROJECT_ID,
+    projectType: 'generic',
+    taskDescription: 'test task',
+  }
+}
+
+function makeStoreStub(sb: SinonSandbox): {
+  readonly getLatest: SinonStub
+  readonly listScenarios: SinonStub
+  readonly store: IHarnessStore
+} {
+  const getLatest = sb.stub()
+  const listScenarios = sb.stub()
+  const store = {
+    deleteOutcomes: sb.stub(),
+    deleteScenario: sb.stub(),
+    getLatest,
+    getVersion: sb.stub(),
+    listOutcomes: sb.stub(),
+    listScenarios,
+    listVersions: sb.stub(),
+    pruneOldVersions: sb.stub(),
+    recordFeedback: sb.stub(),
+    saveOutcome: sb.stub(),
+    saveScenario: sb.stub(),
+    saveVersion: sb.stub(),
+  } satisfies IHarnessStore
+
+  return {getLatest, listScenarios, store}
+}
+
+/**
+ * Build a tools factory whose `curate` rejects/resolves per-arm.
+ * Discriminates arms by a side-channel counter: each factory call
+ * alternates. Tests use this to control per-arm outcomes.
+ */
+function makeTwoArmToolsFactory(
+  sb: SinonSandbox,
+  spec: {readonly harness: () => void; readonly raw: () => void},
+): () => HarnessContextTools {
+  // Call pattern: in `runBaseline`, for EACH scenario, raw arm is
+  // run first (first curate call), then harness arm (second).
+  // Factory is invoked once per single-scenario execution, so odd
+  // calls are raw and even calls are harness — track via counter.
+  let call = 0
+  return () => {
+    call++
+    const isRawArm = call % 2 === 1 // 1st, 3rd, 5th... are raw
+    const curate = sb.stub().callsFake(() => {
+      if (isRawArm) return spec.raw()
+      return spec.harness()
+    })
+    const readFile = sb.stub().resolves()
+    return {curate, readFile} as unknown as HarnessContextTools
+  }
+}
+
+// ─── Tests ────────────────────────────────────────────────────────────────
+
+describe('HarnessBaselineRunner', () => {
+  let sb: SinonSandbox
+
+  beforeEach(() => {
+    sb = createSandbox()
+  })
+
+  afterEach(() => {
+    sb.restore()
+  })
+
+  it('1. throws COUNT_OUT_OF_RANGE when count is 0', async () => {
+    const {store} = makeStoreStub(sb)
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () =>
+      ({curate: sb.stub().resolves(), readFile: sb.stub().resolves()}) as unknown as HarnessContextTools,
+    )
+
+    let caught: unknown
+    try {
+      await runner.runBaseline({commandType: 'curate', count: 0, projectId: PROJECT_ID})
+    } catch (error) {
+      caught = error
+    }
+
+    expect(caught).to.be.instanceOf(HarnessBaselineRunnerError)
+    expect((caught as HarnessBaselineRunnerError).code).to.equal('COUNT_OUT_OF_RANGE')
+  })
+
+  it('2. throws COUNT_OUT_OF_RANGE when count exceeds BASELINE_MAX_COUNT', async () => {
+    const {store} = makeStoreStub(sb)
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () =>
+      ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools,
+    )
+
+    let caught: unknown
+    try {
+      await runner.runBaseline({
+        commandType: 'curate',
+        count: BASELINE_MAX_COUNT + 1,
+        projectId: PROJECT_ID,
+      })
+    } catch (error) {
+      caught = error
+    }
+
+    expect((caught as HarnessBaselineRunnerError).code).to.equal('COUNT_OUT_OF_RANGE')
+  })
+
+  it('3. throws UNSUPPORTED_COMMAND_TYPE for query / chat (v1.0 curate-only)', async () => {
+    const {store} = makeStoreStub(sb)
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () =>
+      ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools,
+    )
+
+    for (const cmd of ['query', 'chat'] as const) {
+      let caught: unknown
+      try {
+        // eslint-disable-next-line no-await-in-loop
+        await runner.runBaseline({commandType: cmd, count: 10, projectId: PROJECT_ID})
+      } catch (error) {
+        caught = error
+      }
+
+      expect(
+        (caught as HarnessBaselineRunnerError).code,
+        `mismatch for commandType=${cmd}`,
+      ).to.equal('UNSUPPORTED_COMMAND_TYPE')
+    }
+  })
+
+  it('4. throws INSUFFICIENT_SCENARIOS when < 3 scenarios exist', async () => {
+    const {getLatest, listScenarios, store} = makeStoreStub(sb)
+    listScenarios.resolves([makeScenario('s1'), makeScenario('s2')])
+    getLatest.resolves(makeVersion())
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () =>
+      ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools,
+    )
+
+    let caught: unknown
+    try {
+      await runner.runBaseline({commandType: 'curate', count: 10, projectId: PROJECT_ID})
+    } catch (error) {
+      caught = error
+    }
+
+    const err = caught as HarnessBaselineRunnerError
+    expect(err.code).to.equal('INSUFFICIENT_SCENARIOS')
+    expect(err.details.found).to.equal(2)
+    expect(err.details.required).to.equal(BASELINE_MIN_SCENARIOS)
+  })
+
+  it('5. throws NO_CURRENT_VERSION when the pair has no stored version', async () => {
+    const {getLatest, listScenarios, store} = makeStoreStub(sb)
+    listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')])
+    getLatest.resolves()
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () =>
+      ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools,
+    )
+
+    let caught: unknown
+    try {
+      await runner.runBaseline({commandType: 'curate', count: 10, projectId: PROJECT_ID})
+    } catch (error) {
+      caught = error
+    }
+
+    expect((caught as HarnessBaselineRunnerError).code).to.equal('NO_CURRENT_VERSION')
+  })
+
+  it('6. happy path: raw fails every scenario, harness succeeds → delta = 100%', async () => {
+    const {getLatest, listScenarios, store} = makeStoreStub(sb)
+    const scenarios = [
+      makeScenario('s1'),
+      makeScenario('s2'),
+      makeScenario('s3'),
+      makeScenario('s4'),
+    ]
+    listScenarios.resolves(scenarios)
+    getLatest.resolves(makeVersion())
+
+    const factory = makeTwoArmToolsFactory(sb, {
+      harness: () => Promise.resolve(),
+      raw() {
+        throw new Error('raw arm failure')
+      },
+    })
+
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory)
+    const report = await runner.runBaseline({
+      commandType: 'curate',
+      count: 10,
+      projectId: PROJECT_ID,
+    })
+
+    expect(report.scenarioCount).to.equal(4)
+    expect(report.rawSuccessRate).to.equal(0)
+    expect(report.harnessSuccessRate).to.equal(1)
+    expect(report.delta).to.equal(1)
+    for (const result of report.perScenario) {
+      expect(result.rawSuccess).to.equal(false)
+      expect(result.rawStderr).to.match(/raw arm failure/)
+      expect(result.harnessSuccess).to.equal(true)
+    }
+  })
+
+  it('7. mixed path: both arms pass → delta = 0', async () => {
+    const {getLatest, listScenarios, store} = makeStoreStub(sb)
+    listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')])
+    getLatest.resolves(makeVersion())
+
+    const factory = makeTwoArmToolsFactory(sb, {
+      harness: () => Promise.resolve(),
+      raw: () => Promise.resolve(),
+    })
+
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory)
+    const report = await runner.runBaseline({
+      commandType: 'curate',
+      count: 3,
+      projectId: PROJECT_ID,
+    })
+
+    expect(report.rawSuccessRate).to.equal(1)
+    expect(report.harnessSuccessRate).to.equal(1)
+    expect(report.delta).to.equal(0)
+  })
+
+  it('8. count caps the scenarios slice', async () => {
+    const {getLatest, listScenarios, store} = makeStoreStub(sb)
+    // 10 scenarios in store; ask for 5 → only 5 run.
+    const scenarios = Array.from({length: 10}, (_, i) => makeScenario(`s${i}`))
+    listScenarios.resolves(scenarios)
+    getLatest.resolves(makeVersion())
+
+    const factory = makeTwoArmToolsFactory(sb, {
+      harness: () => Promise.resolve(),
+      raw: () => Promise.resolve(),
+    })
+
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory)
+    const report = await runner.runBaseline({
+      commandType: 'curate',
+      count: 5,
+      projectId: PROJECT_ID,
+    })
+
+    expect(report.scenarioCount).to.equal(5)
+    expect(report.perScenario.map((r) => r.scenarioId)).to.deep.equal([
+      's0',
+      's1',
+      's2',
+      's3',
+      's4',
+    ])
+  })
+
+  it('9. harness run that throws is captured as failure with stderr', async () => {
+    const {getLatest, listScenarios, store} = makeStoreStub(sb)
+    listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')])
+    getLatest.resolves(makeVersion())
+
+    const factory = makeTwoArmToolsFactory(sb, {
+      harness() {
+        throw new Error('harness cratered')
+      },
+      raw: () => Promise.resolve(),
+    })
+
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), factory)
+    const report = await runner.runBaseline({
+      commandType: 'curate',
+      count: 3,
+      projectId: PROJECT_ID,
+    })
+
+    expect(report.rawSuccessRate).to.equal(1)
+    expect(report.harnessSuccessRate).to.equal(0)
+    expect(report.delta).to.equal(-1)
+    for (const r of report.perScenario) {
+      expect(r.harnessStderr).to.match(/cratered/)
+    }
+  })
+})

From 8c5c0d1d61f88065acd894e5c6f48c90a2742871 Mon Sep 17 00:00:00 2001
From: Danh Doan <danhdoancv@gmail.com>
Date: Wed, 22 Apr 2026 16:40:46 +0700
Subject: [PATCH 2/3] test: [ENG-2325] add deleteOutcome stub after rebase on
 proj

Proj/autoharness-v2 picked up a new IHarnessStore.deleteOutcome method
between branch-point and PR CI. Rebase picked up the interface change;
this commit updates the makeStoreStub helper to satisfy it.
---
 test/unit/agent/harness/harness-baseline-runner.test.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/unit/agent/harness/harness-baseline-runner.test.ts b/test/unit/agent/harness/harness-baseline-runner.test.ts
index fb7e792f4..5fb59380d 100644
--- a/test/unit/agent/harness/harness-baseline-runner.test.ts
+++ b/test/unit/agent/harness/harness-baseline-runner.test.ts
@@ -72,6 +72,7 @@ function makeStoreStub(sb: SinonSandbox): {
   const getLatest = sb.stub()
   const listScenarios = sb.stub()
   const store = {
+    deleteOutcome: sb.stub(),
     deleteOutcomes: sb.stub(),
     deleteScenario: sb.stub(),
     getLatest,

From 6d0b90db32d456735c718d7fb0b70601acec0308 Mon Sep 17 00:00:00 2001
From: Danh Doan <danhdoancv@gmail.com>
Date: Wed, 22 Apr 2026 16:47:39 +0700
Subject: [PATCH 3/3] review: [ENG-2325] address PR 526 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix INSUFFICIENT_SCENARIOS to guard on store coverage, not the sliced
  window (caller passing a too-low --count now surfaces as a valid run,
  not a misleading "run curate N more times" error). Add test 4b.
- Hoist HarnessModule to top-level import type block; drop inline
  dynamic type import.
- Replace `as SupportedCommandType` cast with an isSupportedBaselineCommand
  type predicate so TypeScript narrows at the guard.
- Swap ternary fn-lookup for an explicit fnMap — prevents silent
  fall-through if chat is ever added to the supported set.
- Document the raw-first call-order coupling in makeTwoArmToolsFactory.
---
 .../infra/harness/harness-baseline-runner.ts  | 40 +++++++++++--------
 .../harness/harness-baseline-runner.test.ts   | 29 ++++++++++++++
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/src/agent/infra/harness/harness-baseline-runner.ts b/src/agent/infra/harness/harness-baseline-runner.ts
index f43f18de5..37b22b5dd 100644
--- a/src/agent/infra/harness/harness-baseline-runner.ts
+++ b/src/agent/infra/harness/harness-baseline-runner.ts
@@ -21,7 +21,7 @@
  * wires `HarnessToolsFactory` to `SandboxService.buildHarnessTools({dryRun: true})`.
  */
 
-import type {HarnessContext, ProjectType, ValidatedEvaluationScenario} from '../../core/domain/harness/types.js'
+import type {HarnessContext, HarnessModule, ProjectType, ValidatedEvaluationScenario} from '../../core/domain/harness/types.js'
 import type {IHarnessStore} from '../../core/interfaces/i-harness-store.js'
 import type {ILogger} from '../../core/interfaces/i-logger.js'
 import type {HarnessToolsFactory} from './harness-evaluator.js'
@@ -81,6 +81,10 @@ export const BASELINE_MAX_COUNT = 50
  */
 const SUPPORTED_BASELINE_COMMANDS: ReadonlySet<string> = new Set<SupportedCommandType>(['curate'])
 
+function isSupportedBaselineCommand(cmd: string): cmd is SupportedCommandType {
+  return SUPPORTED_BASELINE_COMMANDS.has(cmd)
+}
+
 export class HarnessBaselineRunner {
   private readonly moduleBuilder: HarnessModuleBuilder
 
@@ -107,7 +111,7 @@ export class HarnessBaselineRunner {
       )
     }
 
-    if (!SUPPORTED_BASELINE_COMMANDS.has(commandType)) {
+    if (!isSupportedBaselineCommand(commandType)) {
       throw new HarnessBaselineRunnerError(
         `baseline is only supported for commandType 'curate' in v1.0 (got '${commandType}'). Query/chat templates land in a follow-up.`,
         'UNSUPPORTED_COMMAND_TYPE',
@@ -116,16 +120,19 @@ export class HarnessBaselineRunner {
     }
 
     const allScenarios = await this.harnessStore.listScenarios(projectId, commandType)
-    const scenarios = allScenarios.slice(0, count)
 
-    if (scenarios.length < BASELINE_MIN_SCENARIOS) {
+    // Guard on store coverage, not the requested window: `--count 2` with
+    // 10 stored scenarios is a valid (if small) run, not a missing-data error.
+    if (allScenarios.length < BASELINE_MIN_SCENARIOS) {
       throw new HarnessBaselineRunnerError(
-        `not enough scenarios — baseline needs at least ${BASELINE_MIN_SCENARIOS}, found ${scenarios.length}. Run curate ${BASELINE_MIN_SCENARIOS - scenarios.length} more time(s) first.`,
+        `not enough scenarios — baseline needs at least ${BASELINE_MIN_SCENARIOS}, found ${allScenarios.length}. Run curate ${BASELINE_MIN_SCENARIOS - allScenarios.length} more time(s) first.`,
         'INSUFFICIENT_SCENARIOS',
-        {found: scenarios.length, required: BASELINE_MIN_SCENARIOS},
+        {found: allScenarios.length, required: BASELINE_MIN_SCENARIOS},
       )
     }
 
+    const scenarios = allScenarios.slice(0, count)
+
     const currentVersion = await this.harnessStore.getLatest(projectId, commandType)
     if (currentVersion === undefined) {
       throw new HarnessBaselineRunnerError(
@@ -135,10 +142,7 @@ export class HarnessBaselineRunner {
       )
     }
 
-    const rawCode = getTemplate(
-      commandType as SupportedCommandType,
-      currentVersion.projectType,
-    ).code
+    const rawCode = getTemplate(commandType, currentVersion.projectType).code
 
     // Build both modules. Either failing to build is a bug-level
     // error — the current version came from the store (previously
@@ -196,7 +200,7 @@ export class HarnessBaselineRunner {
    * side-by-side, different semantics, different test surface).
    */
   private async runSingleScenario(
-    module: import('../../core/domain/harness/types.js').HarnessModule,
+    module: HarnessModule,
     scenario: ValidatedEvaluationScenario,
   ): Promise<{stderr?: string; success: boolean}> {
     const tools = this.toolsFactory()
@@ -210,12 +214,16 @@ export class HarnessBaselineRunner {
       tools,
     }
 
-    try {
-      const fn = scenario.commandType === 'query' ? module.query : module.curate
-      if (fn === undefined) {
-        return {stderr: `no ${scenario.commandType} function on module`, success: false}
-      }
+    const fnMap: Partial<Record<string, (ctx: HarnessContext) => Promise<unknown>>> = {
+      curate: module.curate,
+      query: module.query,
+    }
+    const fn = fnMap[scenario.commandType]
+    if (fn === undefined) {
+      return {stderr: `no ${scenario.commandType} function on module`, success: false}
+    }
 
+    try {
       await fn(ctx)
       return {success: true}
     } catch (error) {
diff --git a/test/unit/agent/harness/harness-baseline-runner.test.ts b/test/unit/agent/harness/harness-baseline-runner.test.ts
index 5fb59380d..836fa8836 100644
--- a/test/unit/agent/harness/harness-baseline-runner.test.ts
+++ b/test/unit/agent/harness/harness-baseline-runner.test.ts
@@ -94,6 +94,13 @@ function makeStoreStub(sb: SinonSandbox): {
  * Build a tools factory whose `curate` rejects/resolves per-arm.
  * Discriminates arms by a side-channel counter: each factory call
  * alternates. Tests use this to control per-arm outcomes.
+ *
+ * IMPLEMENTATION COUPLING: this helper assumes `runBaseline` invokes
+ * the RAW arm before the HARNESS arm within each scenario (odd call =
+ * raw, even = harness). If that order is ever reversed, arm
+ * assertions invert silently. A more resilient scheme would tag the
+ * arm via `ctx.env` and discriminate on the tag; kept simple here
+ * because the single `runBaseline` caller is serial and deliberate.
  */
 function makeTwoArmToolsFactory(
   sb: SinonSandbox,
@@ -209,6 +216,28 @@ describe('HarnessBaselineRunner', () => {
     expect(err.details.required).to.equal(BASELINE_MIN_SCENARIOS)
   })
 
+  it('4b. INSUFFICIENT_SCENARIOS reflects STORE coverage, not the sliced window', async () => {
+    // Store has plenty (10); caller passes --count=2. The guard must not
+    // claim missing data — the run should proceed on the 2-scenario slice.
+    const {getLatest, listScenarios, store} = makeStoreStub(sb)
+    listScenarios.resolves(Array.from({length: 10}, (_, i) => makeScenario(`s${i}`)))
+    getLatest.resolves(makeVersion())
+    const runner = new HarnessBaselineRunner(store, new NoOpLogger(), () =>
+      ({curate: sb.stub(), readFile: sb.stub()}) as unknown as HarnessContextTools,
+    )
+
+    let caught: unknown
+    try {
+      await runner.runBaseline({commandType: 'curate', count: 2, projectId: PROJECT_ID})
+    } catch (error) {
+      caught = error
+    }
+
+    // With the fix, 10 stored ≥ 3, so guard does NOT fire — the run proceeds
+    // and completes on the 2-scenario slice (count itself is valid: [1, 50]).
+    expect(caught).to.equal(undefined)
+  })
+
   it('5. throws NO_CURRENT_VERSION when the pair has no stored version', async () => {
     const {getLatest, listScenarios, store} = makeStoreStub(sb)
     listScenarios.resolves([makeScenario('s1'), makeScenario('s2'), makeScenario('s3')])