Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

236 changes: 236 additions & 0 deletions src/agent/infra/harness/harness-baseline-runner.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
/**
* AutoHarness V2 — HarnessBaselineRunner.
*
* Powers `brv harness baseline` (Tier 1 Q1 brutal-review item).
* Replays the last N stored scenarios through two arms:
* - Raw: the Option C pass-through template for the current
* `projectType`
* - Harness: the pair's current (latest) stored version
*
* Each scenario runs once per arm. Success = harness function
* invocation completed without throwing. Returns per-scenario
* outcomes + overall rates + `delta = harness - raw`.
*
* Complementary to the reference KPI harness (`scripts/autoharness-kpi/`):
* - That runs a FIXED task set on a FIXED model for the release-
* notes headline.
* - This runs the USER's scenarios against the USER's current
* harness for the personal "is it working for me?" signal.
*
* `dryRun` enforcement is the caller's responsibility — production
* wires `HarnessToolsFactory` to `SandboxService.buildHarnessTools({dryRun: true})`.
*/

import type {HarnessContext, HarnessModule, ProjectType, ValidatedEvaluationScenario} from '../../core/domain/harness/types.js'
import type {IHarnessStore} from '../../core/interfaces/i-harness-store.js'
import type {ILogger} from '../../core/interfaces/i-logger.js'
import type {HarnessToolsFactory} from './harness-evaluator.js'

import {HarnessModuleBuilder} from './harness-module-builder.js'
import {getTemplate, type SupportedCommandType} from './templates/index.js'

// ─── Public types ────────────────────────────────────────────────────────

export interface BaselineScenarioResult {
readonly harnessStderr?: string
readonly harnessSuccess: boolean
readonly rawStderr?: string
readonly rawSuccess: boolean
readonly scenarioId: string
}

export interface BaselineReport {
readonly delta: number
readonly harnessSuccessRate: number
readonly perScenario: readonly BaselineScenarioResult[]
readonly rawSuccessRate: number
readonly scenarioCount: number
}

export type BaselineRunnerErrorCode =
| 'COUNT_OUT_OF_RANGE'
| 'INSUFFICIENT_SCENARIOS'
| 'NO_CURRENT_VERSION'
| 'UNSUPPORTED_COMMAND_TYPE'

export class HarnessBaselineRunnerError extends Error {
constructor(
message: string,
public readonly code: BaselineRunnerErrorCode,
public readonly details: Readonly<Record<string, unknown>> = {},
) {
super(message)
this.name = 'HarnessBaselineRunnerError'
}
}

// ─── Constants ───────────────────────────────────────────────────────────

/** Smallest scenario count that produces a meaningful baseline number. */
export const BASELINE_MIN_SCENARIOS = 3

/** Hard ceiling on `--count` — 10 runs × 50 scenarios × 2 arms = 1000 runs, already slow. */
export const BASELINE_MAX_COUNT = 50

// ─── Runner ──────────────────────────────────────────────────────────────

/**
* v1.0 scope narrowing (per Phase 4 Task 4.3): only `curate` templates
* exist, so baseline only supports `commandType === 'curate'` for now.
* When query templates ship, widen this set.
*/
const SUPPORTED_BASELINE_COMMANDS: ReadonlySet<string> = new Set<SupportedCommandType>(['curate'])

function isSupportedBaselineCommand(cmd: string): cmd is SupportedCommandType {
return SUPPORTED_BASELINE_COMMANDS.has(cmd)
}

export class HarnessBaselineRunner {
private readonly moduleBuilder: HarnessModuleBuilder

constructor(
private readonly harnessStore: IHarnessStore,
private readonly logger: ILogger,
private readonly toolsFactory: HarnessToolsFactory,
) {
this.moduleBuilder = new HarnessModuleBuilder(logger)
}

async runBaseline(params: {
readonly commandType: 'chat' | 'curate' | 'query'
readonly count: number
readonly projectId: string
}): Promise<BaselineReport> {
const {commandType, count, projectId} = params

if (count < 1 || count > BASELINE_MAX_COUNT) {
throw new HarnessBaselineRunnerError(
`--count must be in [1, ${BASELINE_MAX_COUNT}], got ${count}`,
'COUNT_OUT_OF_RANGE',
{count, max: BASELINE_MAX_COUNT},
)
}

if (!isSupportedBaselineCommand(commandType)) {
throw new HarnessBaselineRunnerError(
`baseline is only supported for commandType 'curate' in v1.0 (got '${commandType}'). Query/chat templates land in a follow-up.`,
'UNSUPPORTED_COMMAND_TYPE',
{commandType},
)
}

const allScenarios = await this.harnessStore.listScenarios(projectId, commandType)

// Guard on store coverage, not the requested window: `--count 2` with
// 10 stored scenarios is a bad request (count too low), not missing data.
if (allScenarios.length < BASELINE_MIN_SCENARIOS) {
throw new HarnessBaselineRunnerError(
`not enough scenarios — baseline needs at least ${BASELINE_MIN_SCENARIOS}, found ${allScenarios.length}. Run curate ${BASELINE_MIN_SCENARIOS - allScenarios.length} more time(s) first.`,
'INSUFFICIENT_SCENARIOS',
{found: allScenarios.length, required: BASELINE_MIN_SCENARIOS},
)
}

const scenarios = allScenarios.slice(0, count)

const currentVersion = await this.harnessStore.getLatest(projectId, commandType)
if (currentVersion === undefined) {
throw new HarnessBaselineRunnerError(
`no current harness version for (${projectId}, ${commandType}) — bootstrap first by running curate once.`,
'NO_CURRENT_VERSION',
{commandType, projectId},
)
}

const rawCode = getTemplate(commandType, currentVersion.projectType).code

// Build both modules. Either failing to build is a bug-level
// error — the current version came from the store (previously
// validated), and the raw template is a shipped constant.
const rawBuild = this.moduleBuilder.build({
...currentVersion,
code: rawCode,
id: `${currentVersion.id}:raw`,
})
if (!rawBuild.loaded) {
throw new Error(`raw template failed to build: ${rawBuild.reason}`)
}

const harnessBuild = this.moduleBuilder.build(currentVersion)
if (!harnessBuild.loaded) {
throw new Error(`current version failed to build: ${harnessBuild.reason}`)
}

// Run each scenario once per arm. Serial over scenarios to keep
// outcomes traceable in logs; concurrent per-arm inside each
// scenario would add ordering noise for tiny latency savings.
const perScenario: BaselineScenarioResult[] = []
for (const scenario of scenarios) {
// eslint-disable-next-line no-await-in-loop
const rawRun = await this.runSingleScenario(rawBuild.module, scenario)
// eslint-disable-next-line no-await-in-loop
const harnessRun = await this.runSingleScenario(harnessBuild.module, scenario)
perScenario.push({
harnessStderr: harnessRun.stderr,
harnessSuccess: harnessRun.success,
rawStderr: rawRun.stderr,
rawSuccess: rawRun.success,
scenarioId: scenario.id,
})
}

const rawSuccesses = perScenario.filter((r) => r.rawSuccess).length
const harnessSuccesses = perScenario.filter((r) => r.harnessSuccess).length
const rawSuccessRate = rawSuccesses / perScenario.length
const harnessSuccessRate = harnessSuccesses / perScenario.length

return {
delta: harnessSuccessRate - rawSuccessRate,
harnessSuccessRate,
perScenario,
rawSuccessRate,
scenarioCount: perScenario.length,
}
}

/**
* Execute one scenario against one arm's module. Mirrors the
* logic in HarnessEvaluator.executeSingleRun (intentionally not
* shared — evaluator uses 10-run means; baseline uses single-run
* side-by-side, different semantics, different test surface).
*/
private async runSingleScenario(
module: HarnessModule,
scenario: ValidatedEvaluationScenario,
): Promise<{stderr?: string; success: boolean}> {
const tools = this.toolsFactory()
const ctx: HarnessContext = {
abort: new AbortController().signal,
env: {
commandType: scenario.commandType as 'chat' | 'curate' | 'query',
projectType: scenario.projectType as ProjectType,
workingDirectory: '/baseline',
},
tools,
}

const fnMap: Partial<Record<string, (ctx: HarnessContext) => Promise<unknown>>> = {
curate: module.curate,
query: module.query,
}
const fn = fnMap[scenario.commandType]
if (fn === undefined) {
return {stderr: `no ${scenario.commandType} function on module`, success: false}
}

try {
await fn(ctx)
return {success: true}
} catch (error) {
return {
stderr: error instanceof Error ? error.message : String(error),
success: false,
}
}
}
}
Loading
Loading