From 7bec1754ac0da3fead50d2ceac55d56ba2122d17 Mon Sep 17 00:00:00 2001 From: Danh Doan Date: Wed, 22 Apr 2026 15:44:14 +0700 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20[ENG-2332]=20AutoHarness=20V2=20KPI?= =?UTF-8?q?=20harness=20=E2=80=94=20scaffolding=20+=20fixture=20+=20stub-L?= =?UTF-8?q?LM=20runner?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 8 Task 8.4 — Tier 4 S1 brutal-review item. Ships the release-engineering measurement tool that produces the v1.0 ship gate's headline number: the Llama 3.1 8B success-rate delta on a standardized curate task set, raw `tools.*` vs the current harness. v1.0 ship gate: delta >= 30 percentage points on the reference fixture or the release is blocked. ## Layout All release-engineering tooling for this feature lives together under `scripts/autoharness-kpi/`: scripts/autoharness-kpi/ runner.ts — main script (parseArgs + runArm + computeKpiReport + exitCodeForReport + main — all exported, unit-testable) fixture-tasks.json — 20 curate tasks hand-crafted against this codebase (stable across releases) expected-baseline.json — placeholder; populated at v1.0 ship with the real-LLM measurement test/unit/scripts/autoharness-kpi/ runner.test.ts — 19 unit tests package.json "kpi:harness" — runs via `tsx scripts/autoharness-kpi/runner.ts` Chose the `scripts/<feature>/` namespace over a top-level `kpi/` directory: KPI tooling is a subsystem concern (AutoHarness V2 only), not a first-class project concern. Big-tech repos (Kubernetes / Rust / CockroachDB / VSCode) consistently keep per-feature release tooling under an umbrella like `scripts/`, `hack/`, or `tools/` — never at repo root. ## Scope — scaffolding + stub LLM only `--llm stub` is fully operational with deterministic canned arm rates (delta = 50pp, well above the 30pp gate) for script-logic validation. 
`--llm real` currently throws a clear "not yet implemented" error — wiring the real Llama 3.1 8B path into the agent LLM service lands in a follow-up once Phase 7 finalizes the command surface. The fixture + math + CLI + exit-code logic all ship now (zero conflict risk with Phat's in-flight Phase 6 work) so the follow-up real-LLM PR is ~50 LOC of wiring. ## Fixture design 20 curate tasks covering concrete byterover-cli subjects: exported classes under harness/, HarnessConfigSchema fields, IHarnessStore methods, Zod schema enumeration, project-type detection rules, mode thresholds, Mode C cap values, feedback weights, etc. Each task has {id, taskDescription, expectedBehavior}. The tasks are hand-crafted so a reviewer can eyeball their realism. The fixture aims at the "weak model needs hand-holding" band — tasks that are too easy (any model succeeds) or too hard (no model succeeds) wouldn't discriminate between the arms. ## Tests (19) - loadFixture: shipped fixture parses; malformed/empty rejected - makeStubLlmClient: deterministic per-(task, arm); unknown-id fallback to both-arms-succeed - runArm: aggregates correctly; overall = flat mean across runs - computeKpiReport: delta math; metadata carried through - exitCodeForReport: 0 at exactly 0.30 (inclusive floor), 1 at 0.29, 1 on negative delta - parseArgs: defaults; flag parsing; validates --llm + --runs - main (end-to-end with stub): shipped fixture runs; --output writes valid JSON; --llm real surfaces the clear "not yet implemented" error ## Not merging yet Per team process, this PR opens for review; Danh decides separately whether to merge to proj/autoharness-v2 once review feedback is in. 
--- package.json | 1 + .../autoharness-kpi/expected-baseline.json | 10 + scripts/autoharness-kpi/fixture-tasks.json | 109 +++++ scripts/autoharness-kpi/runner.ts | 399 ++++++++++++++++++ .../scripts/autoharness-kpi/runner.test.ts | 314 ++++++++++++++ 5 files changed, 833 insertions(+) create mode 100644 scripts/autoharness-kpi/expected-baseline.json create mode 100644 scripts/autoharness-kpi/fixture-tasks.json create mode 100644 scripts/autoharness-kpi/runner.ts create mode 100644 test/unit/scripts/autoharness-kpi/runner.test.ts diff --git a/package.json b/package.json index 082dd9687..f19d1386c 100644 --- a/package.json +++ b/package.json @@ -185,6 +185,7 @@ "lint": "eslint", "lint:fix": "npm run lint -- --fix", "postpack": "shx rm -f oclif.manifest.json", + "kpi:harness": "tsx scripts/autoharness-kpi/runner.ts", "prepack": "npm run build && BRV_ENV=production oclif manifest", "prepare": "husky", "test": "mocha --forbid-only \"test/**/*.test.ts\"", diff --git a/scripts/autoharness-kpi/expected-baseline.json b/scripts/autoharness-kpi/expected-baseline.json new file mode 100644 index 000000000..6c6fef9e6 --- /dev/null +++ b/scripts/autoharness-kpi/expected-baseline.json @@ -0,0 +1,10 @@ +{ + "$comment": "Populated by the first clean KPI run at v1.0 ship time. Until then, this file is a placeholder. 
On each subsequent ship, compare against these numbers — a significant regression (harnessSuccessRate drops by > 5pp or delta drops by > 5pp) should block the release.", + "fixtureVersion": "1.0.0", + "targetModel": "llama-3.1-8b-instruct", + "runsPerArm": 10, + "measuredAt": null, + "harnessSuccessRate": null, + "rawSuccessRate": null, + "delta": null +} diff --git a/scripts/autoharness-kpi/fixture-tasks.json b/scripts/autoharness-kpi/fixture-tasks.json new file mode 100644 index 000000000..268e9e9e6 --- /dev/null +++ b/scripts/autoharness-kpi/fixture-tasks.json @@ -0,0 +1,109 @@ +{ + "$comment": "AutoHarness V2 KPI reference fixture — 20 curate tasks run once with raw tools.* and once with the current harness, both against Llama 3.1 8B Instruct. Stable across ships; regenerate only on deliberate fixture updates. See features/autoharness-v2/tasks/phase_8/task_04-kpi-harness.md for the methodology.", + "$schema": "./fixture-tasks.schema.json", + "fixtureVersion": "1.0.0", + "targetModel": "llama-3.1-8b-instruct", + "commandType": "curate", + "tasks": [ + { + "id": "t01-list-exports", + "taskDescription": "Curate every exported class name from `src/agent/infra/harness/` along with its source file path.", + "expectedBehavior": "Returns a non-empty list of `{className, filePath}` entries covering HarnessStore, HarnessBootstrap, HarnessModuleBuilder, HarnessOutcomeRecorder, HarnessSynthesizer, HarnessEvaluator, HarnessScenarioCapture (at minimum)." + }, + { + "id": "t02-config-fields", + "taskDescription": "Curate all fields defined on HarnessConfigSchema in `src/agent/infra/agent/agent-schemas.ts`.", + "expectedBehavior": "Returns the full field list: autoLearn, enabled, language, maxVersions, modeOverride, refinementModel." 
+ }, + { + "id": "t03-interface-methods", + "taskDescription": "Curate every method name on the IHarnessStore interface.", + "expectedBehavior": "Returns all ~11 methods (deleteOutcomes, getLatest, getVersion, listOutcomes, listScenarios, listVersions, pruneOldVersions, recordFeedback, saveOutcome, saveScenario, saveVersion)." + }, + { + "id": "t04-zod-schemas", + "taskDescription": "List every Zod schema exported from `src/agent/core/domain/harness/types.ts`.", + "expectedBehavior": "Returns HarnessCapabilitySchema, HarnessLanguageSchema, HarnessMetaSchema, HarnessModeSchema, HarnessVersionSchema, CodeExecOutcomeSchema, EvaluationScenarioSchema, ProjectTypeSchema." + }, + { + "id": "t05-error-codes", + "taskDescription": "Curate all error codes on HarnessStoreErrorCode.", + "expectedBehavior": "Returns OUTCOME_NOT_FOUND, VERSION_CONFLICT." + }, + { + "id": "t06-template-names", + "taskDescription": "Curate all harness template files under `src/agent/infra/harness/templates/`.", + "expectedBehavior": "Returns the 3 shipped curate templates: curate/typescript.ts, curate/python.ts, curate/generic.ts." + }, + { + "id": "t07-mode-thresholds", + "taskDescription": "Curate the heuristic thresholds that map H to HarnessMode.", + "expectedBehavior": "Returns {assisted: 0.30, filter: 0.60, policy: 0.85} or the equivalent ranges." + }, + { + "id": "t08-oclif-commands", + "taskDescription": "List every `brv harness ` command file under `src/oclif/commands/harness/`.", + "expectedBehavior": "Returns the shipped subcommand names. (Minimum set grows as Phase 7 lands; empty result when phase not shipped is acceptable.)" + }, + { + "id": "t09-sandbox-services", + "taskDescription": "Curate every `set*` method on SandboxService.", + "expectedBehavior": "Returns setHarnessConfig, setHarnessStore, setHarnessModuleBuilder, setHarnessOutcomeRecorder, setEnvironmentContext, setFileSystem, setContentGenerator, etc." 
+ }, + { + "id": "t10-event-payloads", + "taskDescription": "Curate the payload fields of the harness:refinement-completed event.", + "expectedBehavior": "Returns fromVersionId, accepted, toVersionId?, deltaH?, reason?." + }, + { + "id": "t11-tools-sdk-methods", + "taskDescription": "Curate every method surface exposed by HarnessContextTools in types.ts.", + "expectedBehavior": "Returns curate (+ signature) and readFile (+ signature). v1.0 surface is exactly those two." + }, + { + "id": "t12-capability-tags", + "taskDescription": "Curate every HarnessCapability tag.", + "expectedBehavior": "Returns discover, extract, buildOps, search, gather, curate, answer." + }, + { + "id": "t13-phase-task-docs", + "taskDescription": "Curate every task doc under `features/autoharness-v2/tasks/phase_5/`.", + "expectedBehavior": "Returns the 5 task docs: task_01 mode selector, task_02 prompt contributor, task_03 Mode C caps, task_04 agent wiring, task_05 integration test." + }, + { + "id": "t14-numeric-parameters", + "taskDescription": "Curate the numeric parameters locked in v1-design-decisions.md §2.1 (heuristic formula weights + windowing).", + "expectedBehavior": "Returns formula weights {successRate: 0.2, errorRate: 0.3, realHarnessRate: 0.5}, window size 50, min-sample floor 10, recency 30-day half-life, pass-through cap 0.55." + }, + { + "id": "t15-mode-c-caps", + "taskDescription": "Curate the two Mode C safety caps and their values.", + "expectedBehavior": "Returns ops cap 50 per curate() call, rate cap 30 mapExtract/60s process-wide." + }, + { + "id": "t16-refinement-gates", + "taskDescription": "Curate the refinement acceptance thresholds.", + "expectedBehavior": "Returns ΔH improvement ≥ 0.05, evaluation runs per candidate 10, max versions per pair 20, refinement queue concurrency 1 per pair." 
+ }, + { + "id": "t17-feedback-weights", + "taskDescription": "Curate the user-feedback insertion weights.", + "expectedBehavior": "Returns bad → 3 synthetic failures, good → 1 synthetic success, cap 10 per window." + }, + { + "id": "t18-project-detector-rules", + "taskDescription": "Curate the project-type detection rules in project-detector.ts.", + "expectedBehavior": "Returns: tsconfig.json → typescript; package.json with typescript dep → typescript; pyproject.toml → python; setup.py → python; setup.cfg → python; polyglot → both." + }, + { + "id": "t19-weak-model-block-list", + "taskDescription": "Curate the weak-model block list entries (refinement skipped).", + "expectedBehavior": "Returns all entries in REFINEMENT_MODEL_BLOCKLIST (llama-3.1-8b-instruct, mistral-7b-instruct, qwen-2.5-7b-instruct, gemma-2-9b-it, phi-3-mini)." + }, + { + "id": "t20-phase-ship-gates", + "taskDescription": "Curate each phase's ship-gate integration test file path.", + "expectedBehavior": "Returns Phase 3 isolation.test.ts, Phase 4 cold-start.test.ts, Phase 5 mode-selection.test.ts, Phase 6 learning-loop.test.ts, Phase 7 cli-lifecycle.test.ts, Phase 8 full-lifecycle.test.ts." + } + ] +} diff --git a/scripts/autoharness-kpi/runner.ts b/scripts/autoharness-kpi/runner.ts new file mode 100644 index 000000000..884395b92 --- /dev/null +++ b/scripts/autoharness-kpi/runner.ts @@ -0,0 +1,399 @@ +#!/usr/bin/env node +/** + * AutoHarness V2 — KPI harness (Phase 8 Task 8.4). + * + * Runs the reference fixture (`scripts/autoharness-kpi/fixture-tasks.json`) + * through two arms: + * - Arm A: raw `tools.*` orchestration (no harness) + * - Arm B: current harness version + * + * Reports the success-rate delta. 
v1.0 ship gate: + * delta >= 0.30 → exit 0 + * delta < 0.30 → exit 1 (block the release) + * + * Usage: + * npm run kpi:harness -- [--fixture ] + * [--runs ] + * [--output ] + * [--llm stub|real] + * + * Defaults: `--llm stub` (deterministic synthetic arm results for + * CI validation of the script itself). Real-LLM measurement runs + * are triggered pre-ship via `--llm real` and require the agent + * stack to be wired — landing in a follow-up PR. + */ + +import {readFileSync, writeFileSync} from 'node:fs' +import {resolve} from 'node:path' + +// ───────────────────────────────────────────────────────────────────────── +// Types +// ───────────────────────────────────────────────────────────────────────── + +export interface FixtureTask { + readonly expectedBehavior: string + readonly id: string + readonly taskDescription: string +} + +export interface Fixture { + readonly commandType: 'chat' | 'curate' | 'query' + readonly fixtureVersion: string + readonly targetModel: string + readonly tasks: readonly FixtureTask[] +} + +export type ArmName = 'harness' | 'raw' + +export interface ArmRunResult { + readonly runs: readonly boolean[] // true = success + readonly successRate: number // mean across runs + readonly taskId: string +} + +export interface ArmResult { + readonly arm: ArmName + readonly overallSuccessRate: number + readonly perTask: readonly ArmRunResult[] +} + +export interface KpiReport { + readonly delta: number + readonly fixtureVersion: string + readonly harnessSuccessRate: number + readonly measuredAt: number // Date.now() + readonly perTask: ReadonlyArray<{ + readonly harnessSuccessRate: number + readonly rawSuccessRate: number + readonly taskId: string + }> + readonly rawSuccessRate: number + readonly runsPerArm: number + readonly targetModel: string +} + +/** + * Shape of an LLM backend the KPI runner delegates to. Stubbed in + * the default `--llm stub` path; real path (follow-up PR) wires this + * into the agent stack with a Llama 3.1 8B provider. 
+ */ +export interface KpiLlmClient { + runTask(task: FixtureTask, arm: ArmName): Promise +} + +// ───────────────────────────────────────────────────────────────────────── +// Constants — ship-gate delta + stub arm success rates +// ───────────────────────────────────────────────────────────────────────── + +/** v1.0 ship-gate threshold per `v1-design-decisions.md §2.7`. */ +export const SHIP_GATE_DELTA = 0.3 + +// Stub success rates, per task id. Deterministic so the script's +// exit-code logic is testable. Mix designed to produce +// delta = 0.50 (well above ship gate) on the default 10-run config: +// - Tasks t01-t10: raw always fails, harness always succeeds (delta 100%) +// - Tasks t11-t20: both arms always succeed (delta 0%) +// - Mean delta across 20 tasks = 0.50 +const STUB_RATES: Readonly> = { + t01: {harness: 1, raw: 0}, + t02: {harness: 1, raw: 0}, + t03: {harness: 1, raw: 0}, + t04: {harness: 1, raw: 0}, + t05: {harness: 1, raw: 0}, + t06: {harness: 1, raw: 0}, + t07: {harness: 1, raw: 0}, + t08: {harness: 1, raw: 0}, + t09: {harness: 1, raw: 0}, + t10: {harness: 1, raw: 0}, + t11: {harness: 1, raw: 1}, + t12: {harness: 1, raw: 1}, + t13: {harness: 1, raw: 1}, + t14: {harness: 1, raw: 1}, + t15: {harness: 1, raw: 1}, + t16: {harness: 1, raw: 1}, + t17: {harness: 1, raw: 1}, + t18: {harness: 1, raw: 1}, + t19: {harness: 1, raw: 1}, + t20: {harness: 1, raw: 1}, +} + +// ───────────────────────────────────────────────────────────────────────── +// Pure functions (unit-tested) +// ───────────────────────────────────────────────────────────────────────── + +/** Parse and validate a fixture file. Throws on schema mismatch. 
*/ +export function loadFixture(path: string): Fixture { + const raw = JSON.parse(readFileSync(path, 'utf8')) as unknown + if (typeof raw !== 'object' || raw === null) { + throw new TypeError(`fixture is not an object: ${path}`) + } + + const parsed = raw as Record + const {tasks} = parsed + if (!Array.isArray(tasks) || tasks.length === 0) { + throw new Error(`fixture has no tasks: ${path}`) + } + + for (const task of tasks) { + if ( + typeof task !== 'object' || + task === null || + typeof (task as FixtureTask).id !== 'string' || + typeof (task as FixtureTask).taskDescription !== 'string' || + typeof (task as FixtureTask).expectedBehavior !== 'string' + ) { + throw new Error(`fixture task malformed: ${JSON.stringify(task)}`) + } + } + + return { + commandType: (parsed.commandType as Fixture['commandType']) ?? 'curate', + fixtureVersion: (parsed.fixtureVersion as string) ?? 'unversioned', + targetModel: (parsed.targetModel as string) ?? 'unknown', + tasks: tasks as FixtureTask[], + } +} + +/** + * Deterministic stub LLM client. Used by `--llm stub` and by unit + * tests. Keeps the script's delta math + exit logic testable + * without touching a real model. + */ +export function makeStubLlmClient(): KpiLlmClient { + return { + async runTask(task: FixtureTask, arm: ArmName): Promise { + const idKey = task.id.split('-')[0] // e.g. "t01" from "t01-list-exports" + const rates = idKey === undefined ? undefined : STUB_RATES[idKey] + if (rates === undefined) { + // Unknown task id → baseline "both arms succeed" to keep + // stub output predictable for custom fixtures. + return true + } + + return arm === 'raw' ? rates.raw === 1 : rates.harness === 1 + }, + } +} + +/** Run one arm over the full fixture, `runs` times per task. 
*/ +export async function runArm( + arm: ArmName, + tasks: readonly FixtureTask[], + runs: number, + client: KpiLlmClient, +): Promise { + const perTask: ArmRunResult[] = [] + + for (const task of tasks) { + const results: boolean[] = [] + for (let i = 0; i < runs; i++) { + // eslint-disable-next-line no-await-in-loop + const ok = await client.runTask(task, arm) + results.push(ok) + } + + const successRate = results.filter(Boolean).length / results.length + perTask.push({runs: results, successRate, taskId: task.id}) + } + + const totalRuns = perTask.reduce((acc, t) => acc + t.runs.length, 0) + const totalSuccesses = perTask.reduce( + (acc, t) => acc + t.runs.filter(Boolean).length, + 0, + ) + const overallSuccessRate = totalRuns === 0 ? 0 : totalSuccesses / totalRuns + + return {arm, overallSuccessRate, perTask} +} + +/** Combine two arm results into the final KPI report. */ +export function computeKpiReport(args: { + readonly fixture: Fixture + readonly harnessArm: ArmResult + readonly measuredAt?: number + readonly rawArm: ArmResult + readonly runsPerArm: number +}): KpiReport { + const {fixture, harnessArm, rawArm, runsPerArm} = args + const harnessSuccessRate = harnessArm.overallSuccessRate + const rawSuccessRate = rawArm.overallSuccessRate + + const byTaskId = new Map(rawArm.perTask.map((t) => [t.taskId, t.successRate])) + const perTask = harnessArm.perTask.map((h) => ({ + harnessSuccessRate: h.successRate, + rawSuccessRate: byTaskId.get(h.taskId) ?? 0, + taskId: h.taskId, + })) + + return { + delta: harnessSuccessRate - rawSuccessRate, + fixtureVersion: fixture.fixtureVersion, + harnessSuccessRate, + measuredAt: args.measuredAt ?? Date.now(), + perTask, + rawSuccessRate, + runsPerArm, + targetModel: fixture.targetModel, + } +} + +/** Exit code from a report: 0 if delta ≥ ship gate, else 1. */ +export function exitCodeForReport(report: KpiReport): 0 | 1 { + return report.delta >= SHIP_GATE_DELTA ? 0 : 1 +} + +/** Human-readable table for stdout. 
*/ +export function renderReport(report: KpiReport): string { + const lines: string[] = [] + lines.push( + `KPI report — fixture ${report.fixtureVersion} × model ${report.targetModel}`, + `runs/arm: ${report.runsPerArm}`, + '', + `task raw harness`, + `─`.repeat(50), + ) + for (const t of report.perTask) { + const id = t.taskId.padEnd(28) + const raw = `${Math.round(t.rawSuccessRate * 100)}%`.padStart(5) + const harness = `${Math.round(t.harnessSuccessRate * 100)}%`.padStart(8) + lines.push(`${id} ${raw} ${harness}`) + } + + lines.push( + `─`.repeat(50), + `overall: ${(report.rawSuccessRate * 100).toFixed(1).padStart(5)}% ${(report.harnessSuccessRate * 100).toFixed(1).padStart(7)}%`, + `delta: ${(report.delta >= 0 ? '+' : '') + (report.delta * 100).toFixed(1)}pp`, + report.delta >= SHIP_GATE_DELTA + ? `✓ ship gate met (>= ${SHIP_GATE_DELTA * 100}pp)` + : `✗ ship gate NOT met — shortfall ${((SHIP_GATE_DELTA - report.delta) * 100).toFixed(1)}pp`, + ) + + return lines.join('\n') +} + +// ───────────────────────────────────────────────────────────────────────── +// CLI +// ───────────────────────────────────────────────────────────────────────── + +interface CliArgs { + readonly fixture: string + readonly llm: 'real' | 'stub' + readonly output?: string + readonly runs: number +} + +export function parseArgs(argv: readonly string[]): CliArgs { + let fixture = 'scripts/autoharness-kpi/fixture-tasks.json' + let output: string | undefined + let runs = 10 + let llm: CliArgs['llm'] = 'stub' + + for (let i = 0; i < argv.length; i++) { + const a = argv[i] + switch (a) { + case '--fixture': { + const next = argv[i + 1] + if (next === undefined) throw new Error('--fixture requires a path') + fixture = next + i++ + + break; + } + + case '--llm': { + const next = argv[i + 1] + if (next !== 'stub' && next !== 'real') { + throw new Error(`--llm must be 'stub' or 'real', got: ${next ?? 
''}`) + } + + llm = next + i++ + + break; + } + + case '--output': { + const next = argv[i + 1] + if (next === undefined) throw new Error('--output requires a path') + output = next + i++ + + break; + } + + case '--runs': { + const next = argv[i + 1] + if (next === undefined) throw new Error('--runs requires a number') + runs = Number.parseInt(next, 10) + if (!Number.isFinite(runs) || runs <= 0) { + throw new Error(`--runs must be a positive integer, got: ${next}`) + } + + i++ + + break; + } + // No default + } + } + + return {fixture, llm, output, runs} +} + +export async function main(argv: readonly string[]): Promise { + const args = parseArgs(argv) + const fixture = loadFixture(resolve(args.fixture)) + + let client: KpiLlmClient + if (args.llm === 'stub') { + client = makeStubLlmClient() + } else { + // Real-LLM path — requires the agent stack wired for Llama 3.1 8B. + // Lands in a follow-up PR once the KPI run is exercised pre-ship. + throw new Error( + "--llm real is not yet implemented in this PR — use --llm stub for script validation. " + + 'The real-LLM path will wire into the agent LLM service in a follow-up; see task_04-kpi-harness.md.', + ) + } + + const rawArm = await runArm('raw', fixture.tasks, args.runs, client) + const harnessArm = await runArm('harness', fixture.tasks, args.runs, client) + const report = computeKpiReport({ + fixture, + harnessArm, + rawArm, + runsPerArm: args.runs, + }) + + + console.log(renderReport(report)) + if (args.output !== undefined) { + writeFileSync(resolve(args.output), JSON.stringify(report, null, 2)) + } + + return exitCodeForReport(report) +} + +// ───────────────────────────────────────────────────────────────────────── +// Entry point (only runs when invoked directly via `tsx`/`node`) +// ───────────────────────────────────────────────────────────────────────── + +// `process.exit()` is the right API for a CLI that needs explicit +// ship-gate exit codes (0 / 1 / 2). 
`unicorn/prefer-top-level-await` +// is fine here because the guarded block only runs in the direct- +// invocation path, not when test files import this module. + +const invokedDirectly = import.meta.url === `file://${process.argv[1]}` +if (invokedDirectly) { + try { + const code = await main(process.argv.slice(2)) + // eslint-disable-next-line n/no-process-exit + process.exit(code) + } catch (error) { + console.error( + `kpi-harness failed: ${error instanceof Error ? error.message : String(error)}`, + ) + // eslint-disable-next-line n/no-process-exit + process.exit(2) + } +} diff --git a/test/unit/scripts/autoharness-kpi/runner.test.ts b/test/unit/scripts/autoharness-kpi/runner.test.ts new file mode 100644 index 000000000..da5ac465c --- /dev/null +++ b/test/unit/scripts/autoharness-kpi/runner.test.ts @@ -0,0 +1,314 @@ +import {expect} from 'chai' +import {mkdtempSync, readFileSync, rmSync, writeFileSync} from 'node:fs' +import {tmpdir} from 'node:os' +import {join} from 'node:path' + +import type { + ArmResult, + Fixture, + FixtureTask, + KpiLlmClient, + KpiReport, +} from '../../../../scripts/autoharness-kpi/runner.js' + +import { + computeKpiReport, + exitCodeForReport, + loadFixture, + main, + makeStubLlmClient, + parseArgs, + runArm, + SHIP_GATE_DELTA, +} from '../../../../scripts/autoharness-kpi/runner.js' + +// ───────────────────────────────────────────────────────────────────────── +// Helpers +// ───────────────────────────────────────────────────────────────────────── + +const REPO_FIXTURE_PATH = 'scripts/autoharness-kpi/fixture-tasks.json' + +function makeFixture(tasks: FixtureTask[]): Fixture { + return { + commandType: 'curate', + fixtureVersion: 'test-1', + targetModel: 'stub', + tasks, + } +} + +function makeArmResult( + arm: 'harness' | 'raw', + taskIds: readonly string[], + rate: number, + runs: number, +): ArmResult { + const perTask = taskIds.map((id) => { + const successes = Math.round(rate * runs) + const runsResult: boolean[] = [] + for (let 
i = 0; i < runs; i++) runsResult.push(i < successes) + return {runs: runsResult, successRate: rate, taskId: id} + }) + return {arm, overallSuccessRate: rate, perTask} +} + +// ───────────────────────────────────────────────────────────────────────── +// Tests +// ───────────────────────────────────────────────────────────────────────── + +describe('KPI harness', () => { + describe('loadFixture', () => { + it('parses the shipped reference fixture', () => { + const fixture = loadFixture(REPO_FIXTURE_PATH) + expect(fixture.fixtureVersion).to.equal('1.0.0') + expect(fixture.targetModel).to.equal('llama-3.1-8b-instruct') + expect(fixture.commandType).to.equal('curate') + expect(fixture.tasks).to.have.length(20) + for (const task of fixture.tasks) { + expect(task.id).to.be.a('string').and.match(/^t\d{2}-/) + expect(task.taskDescription).to.be.a('string').and.not.be.empty + expect(task.expectedBehavior).to.be.a('string').and.not.be.empty + } + }) + + it('throws on a malformed fixture', () => { + const dir = mkdtempSync(join(tmpdir(), 'kpi-bad-')) + try { + const bad = join(dir, 'bad.json') + writeFileSync(bad, JSON.stringify({tasks: [{id: 1}]})) + expect(() => loadFixture(bad)).to.throw(/malformed/) + } finally { + rmSync(dir, {force: true, recursive: true}) + } + }) + + it('throws when tasks array is empty', () => { + const dir = mkdtempSync(join(tmpdir(), 'kpi-empty-')) + try { + const empty = join(dir, 'empty.json') + writeFileSync(empty, JSON.stringify({tasks: []})) + expect(() => loadFixture(empty)).to.throw(/no tasks/) + } finally { + rmSync(dir, {force: true, recursive: true}) + } + }) + }) + + describe('makeStubLlmClient', () => { + it('returns deterministic results per task + arm', async () => { + const client = makeStubLlmClient() + const task: FixtureTask = { + expectedBehavior: 'x', + id: 't01-list-exports', + taskDescription: 'x', + } + // t01: raw=0, harness=1 + expect(await client.runTask(task, 'raw')).to.equal(false) + expect(await client.runTask(task, 
'harness')).to.equal(true) + // Deterministic — repeated calls agree. + expect(await client.runTask(task, 'raw')).to.equal(false) + expect(await client.runTask(task, 'harness')).to.equal(true) + }) + + it('defaults to both-arms-succeed for unknown task ids', async () => { + const client = makeStubLlmClient() + const task: FixtureTask = { + expectedBehavior: 'x', + id: 'unknown-task', + taskDescription: 'x', + } + expect(await client.runTask(task, 'raw')).to.equal(true) + expect(await client.runTask(task, 'harness')).to.equal(true) + }) + }) + + describe('runArm', () => { + it('aggregates success rate across N runs per task', async () => { + const client: KpiLlmClient = { + runTask: async () => true, + } + const tasks: FixtureTask[] = [ + {expectedBehavior: 'x', id: 'a', taskDescription: 'x'}, + {expectedBehavior: 'y', id: 'b', taskDescription: 'y'}, + ] + const result = await runArm('raw', tasks, 5, client) + expect(result.perTask).to.have.length(2) + expect(result.perTask[0].runs).to.have.length(5) + expect(result.overallSuccessRate).to.equal(1) + }) + + it('computes overall success rate as the flat mean across all runs', async () => { + // Half of the 4 tasks always succeed; other half always fail. + // With 2 runs each: 4 successes / 8 runs = 0.5. 
+ const client: KpiLlmClient = { + runTask: async (task) => task.id.startsWith('good'), + } + const tasks: FixtureTask[] = [ + {expectedBehavior: 'x', id: 'good-1', taskDescription: 'x'}, + {expectedBehavior: 'x', id: 'good-2', taskDescription: 'x'}, + {expectedBehavior: 'x', id: 'bad-1', taskDescription: 'x'}, + {expectedBehavior: 'x', id: 'bad-2', taskDescription: 'x'}, + ] + const result = await runArm('harness', tasks, 2, client) + expect(result.overallSuccessRate).to.equal(0.5) + }) + }) + + describe('computeKpiReport', () => { + it('computes delta as harness minus raw', () => { + const fixture = makeFixture([ + {expectedBehavior: 'x', id: 'a', taskDescription: 'x'}, + ]) + const rawArm = makeArmResult('raw', ['a'], 0.4, 10) + const harnessArm = makeArmResult('harness', ['a'], 0.8, 10) + const report = computeKpiReport({fixture, harnessArm, rawArm, runsPerArm: 10}) + expect(report.rawSuccessRate).to.equal(0.4) + expect(report.harnessSuccessRate).to.equal(0.8) + expect(report.delta).to.be.closeTo(0.4, 1e-9) + }) + + it('carries fixture + model metadata through to the report', () => { + const fixture = makeFixture([ + {expectedBehavior: 'x', id: 'a', taskDescription: 'x'}, + ]) + const rawArm = makeArmResult('raw', ['a'], 0, 1) + const harnessArm = makeArmResult('harness', ['a'], 1, 1) + const report = computeKpiReport({ + fixture, + harnessArm, + measuredAt: 1_700_000_000_000, + rawArm, + runsPerArm: 1, + }) + expect(report.fixtureVersion).to.equal('test-1') + expect(report.targetModel).to.equal('stub') + expect(report.runsPerArm).to.equal(1) + expect(report.measuredAt).to.equal(1_700_000_000_000) + }) + }) + + describe('exitCodeForReport', () => { + it('exits 0 when delta ≥ ship-gate threshold (0.30)', () => { + const report: KpiReport = { + delta: SHIP_GATE_DELTA, + fixtureVersion: 'x', + harnessSuccessRate: 0.7, + measuredAt: 0, + perTask: [], + rawSuccessRate: 0.4, + runsPerArm: 10, + targetModel: 'x', + } + expect(exitCodeForReport(report)).to.equal(0) + 
}) + + it('exits 1 when delta is just below (0.29)', () => { + const report: KpiReport = { + delta: 0.29, + fixtureVersion: 'x', + harnessSuccessRate: 0.69, + measuredAt: 0, + perTask: [], + rawSuccessRate: 0.4, + runsPerArm: 10, + targetModel: 'x', + } + expect(exitCodeForReport(report)).to.equal(1) + }) + + it('exits 1 for zero / negative delta (harness worse)', () => { + const report: KpiReport = { + delta: -0.1, + fixtureVersion: 'x', + harnessSuccessRate: 0.3, + measuredAt: 0, + perTask: [], + rawSuccessRate: 0.4, + runsPerArm: 10, + targetModel: 'x', + } + expect(exitCodeForReport(report)).to.equal(1) + }) + }) + + describe('parseArgs', () => { + it('uses sane defaults', () => { + const args = parseArgs([]) + expect(args.fixture).to.equal('scripts/autoharness-kpi/fixture-tasks.json') + expect(args.llm).to.equal('stub') + expect(args.runs).to.equal(10) + expect(args.output).to.equal(undefined) + }) + + it('honors --fixture, --runs, --output, --llm', () => { + const args = parseArgs([ + '--fixture', + '/tmp/f.json', + '--runs', + '5', + '--output', + '/tmp/o.json', + '--llm', + 'stub', + ]) + expect(args.fixture).to.equal('/tmp/f.json') + expect(args.runs).to.equal(5) + expect(args.output).to.equal('/tmp/o.json') + expect(args.llm).to.equal('stub') + }) + + it('rejects --llm other than stub|real', () => { + expect(() => parseArgs(['--llm', 'fake'])).to.throw(/must be 'stub' or 'real'/) + }) + + it('rejects --runs non-numeric', () => { + expect(() => parseArgs(['--runs', 'abc'])).to.throw(/positive integer/) + }) + }) + + describe('main (end-to-end with stub LLM)', () => { + it('runs the shipped fixture and exits 0 at the ship gate', async function () { + this.timeout(5000) + const code = await main(['--fixture', REPO_FIXTURE_PATH, '--runs', '1']) + // Stub rates: 10 tasks with delta 100%, 10 with delta 0% → + // overall delta 0.50, well above the 0.30 gate. 
+ expect(code).to.equal(0) + }) + + it('writes the report JSON when --output is given', async function () { + this.timeout(5000) + const dir = mkdtempSync(join(tmpdir(), 'kpi-out-')) + try { + const out = join(dir, 'report.json') + const code = await main([ + '--fixture', + REPO_FIXTURE_PATH, + '--runs', + '1', + '--output', + out, + ]) + expect(code).to.equal(0) + const report = JSON.parse(readFileSync(out, 'utf8')) as KpiReport + expect(report.rawSuccessRate).to.equal(0.5) + expect(report.harnessSuccessRate).to.equal(1) + expect(report.delta).to.equal(0.5) + expect(report.perTask).to.have.length(20) + } finally { + rmSync(dir, {force: true, recursive: true}) + } + }) + + it('throws a clear error when --llm real is requested (follow-up work)', async () => { + let caught: unknown + try { + await main(['--fixture', REPO_FIXTURE_PATH, '--llm', 'real']) + } catch (error) { + caught = error + } + + expect(caught).to.be.an('error') + expect((caught as Error).message).to.match(/--llm real is not yet implemented/) + }) + }) +}) From 9c366d3138c422f15cdb134400c95a22b45eb0e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Thu=E1=BA=ADn=20Ph=C3=A1t?= Date: Thu, 23 Apr 2026 16:55:23 +0700 Subject: [PATCH 2/2] refactor: [ENG-2332] address review-agent feedback on KPI harness --- scripts/autoharness-kpi/fixture-tasks.json | 1 - scripts/autoharness-kpi/runner.ts | 60 +++++++++---- .../scripts/autoharness-kpi/runner.test.ts | 86 ++++++++++++++++++- 3 files changed, 127 insertions(+), 20 deletions(-) diff --git a/scripts/autoharness-kpi/fixture-tasks.json b/scripts/autoharness-kpi/fixture-tasks.json index 268e9e9e6..889817e34 100644 --- a/scripts/autoharness-kpi/fixture-tasks.json +++ b/scripts/autoharness-kpi/fixture-tasks.json @@ -1,6 +1,5 @@ { "$comment": "AutoHarness V2 KPI reference fixture — 20 curate tasks run once with raw tools.* and once with the current harness, both against Llama 3.1 8B Instruct. 
Stable across ships; regenerate only on deliberate fixture updates. See features/autoharness-v2/tasks/phase_8/task_04-kpi-harness.md for the methodology.", - "$schema": "./fixture-tasks.schema.json", "fixtureVersion": "1.0.0", "targetModel": "llama-3.1-8b-instruct", "commandType": "curate", diff --git a/scripts/autoharness-kpi/runner.ts b/scripts/autoharness-kpi/runner.ts index 884395b92..6f467e3a8 100644 --- a/scripts/autoharness-kpi/runner.ts +++ b/scripts/autoharness-kpi/runner.ts @@ -121,9 +121,31 @@ const STUB_RATES: Readonly> = { // Pure functions (unit-tested) // ───────────────────────────────────────────────────────────────────────── +function isFixtureTask(value: unknown): value is FixtureTask { + if (typeof value !== 'object' || value === null) return false + const obj = value as Record + return ( + typeof obj.id === 'string' && + typeof obj.taskDescription === 'string' && + typeof obj.expectedBehavior === 'string' + ) +} + +const VALID_COMMAND_TYPES = new Set(['chat', 'curate', 'query']) + +function isValidCommandType(v: unknown): v is Fixture['commandType'] { + return typeof v === 'string' && VALID_COMMAND_TYPES.has(v) +} + /** Parse and validate a fixture file. Throws on schema mismatch. */ export function loadFixture(path: string): Fixture { - const raw = JSON.parse(readFileSync(path, 'utf8')) as unknown + let raw: unknown + try { + raw = JSON.parse(readFileSync(path, 'utf8')) as unknown + } catch (error) { + throw new Error(`fixture file is not valid JSON: ${path} — ${error instanceof Error ? 
error.message : String(error)}`) + } + if (typeof raw !== 'object' || raw === null) { throw new TypeError(`fixture is not an object: ${path}`) } @@ -134,23 +156,20 @@ export function loadFixture(path: string): Fixture { throw new Error(`fixture has no tasks: ${path}`) } + const validatedTasks: FixtureTask[] = [] for (const task of tasks) { - if ( - typeof task !== 'object' || - task === null || - typeof (task as FixtureTask).id !== 'string' || - typeof (task as FixtureTask).taskDescription !== 'string' || - typeof (task as FixtureTask).expectedBehavior !== 'string' - ) { + if (!isFixtureTask(task)) { throw new Error(`fixture task malformed: ${JSON.stringify(task)}`) } + + validatedTasks.push(task) } return { - commandType: (parsed.commandType as Fixture['commandType']) ?? 'curate', - fixtureVersion: (parsed.fixtureVersion as string) ?? 'unversioned', - targetModel: (parsed.targetModel as string) ?? 'unknown', - tasks: tasks as FixtureTask[], + commandType: isValidCommandType(parsed.commandType) ? parsed.commandType : 'curate', + fixtureVersion: typeof parsed.fixtureVersion === 'string' ? parsed.fixtureVersion : 'unversioned', + targetModel: typeof parsed.targetModel === 'string' ? parsed.targetModel : 'unknown', + tasks: validatedTasks, } } @@ -219,11 +238,18 @@ export function computeKpiReport(args: { const rawSuccessRate = rawArm.overallSuccessRate const byTaskId = new Map(rawArm.perTask.map((t) => [t.taskId, t.successRate])) - const perTask = harnessArm.perTask.map((h) => ({ - harnessSuccessRate: h.successRate, - rawSuccessRate: byTaskId.get(h.taskId) ?? 
0, - taskId: h.taskId, - })) + const perTask = harnessArm.perTask.map((h) => { + const rawRate = byTaskId.get(h.taskId) + if (rawRate === undefined) { + throw new Error(`task ID mismatch: harness arm has '${h.taskId}' but raw arm does not`) + } + + return { + harnessSuccessRate: h.successRate, + rawSuccessRate: rawRate, + taskId: h.taskId, + } + }) return { delta: harnessSuccessRate - rawSuccessRate, diff --git a/test/unit/scripts/autoharness-kpi/runner.test.ts b/test/unit/scripts/autoharness-kpi/runner.test.ts index da5ac465c..4b35ee475 100644 --- a/test/unit/scripts/autoharness-kpi/runner.test.ts +++ b/test/unit/scripts/autoharness-kpi/runner.test.ts @@ -18,6 +18,7 @@ import { main, makeStubLlmClient, parseArgs, + renderReport, runArm, SHIP_GATE_DELTA, } from '../../../../scripts/autoharness-kpi/runner.js' @@ -82,6 +83,17 @@ describe('KPI harness', () => { } }) + it('throws a clear error on invalid JSON', () => { + const dir = mkdtempSync(join(tmpdir(), 'kpi-invalid-json-')) + try { + const bad = join(dir, 'not-json.json') + writeFileSync(bad, '{not: valid json}') + expect(() => loadFixture(bad)).to.throw(/not valid JSON/) + } finally { + rmSync(dir, {force: true, recursive: true}) + } + }) + it('throws when tasks array is empty', () => { const dir = mkdtempSync(join(tmpdir(), 'kpi-empty-')) try { @@ -167,6 +179,16 @@ describe('KPI harness', () => { expect(report.delta).to.be.closeTo(0.4, 1e-9) }) + it('throws on task-ID mismatch between arms', () => { + const fixture = makeFixture([ + {expectedBehavior: 'x', id: 'a', taskDescription: 'x'}, + ]) + const rawArm = makeArmResult('raw', ['b'], 0.5, 10) // 'b' not 'a' + const harnessArm = makeArmResult('harness', ['a'], 0.8, 10) + expect(() => computeKpiReport({fixture, harnessArm, rawArm, runsPerArm: 10})) + .to.throw(/task ID mismatch/) + }) + it('carries fixture + model metadata through to the report', () => { const fixture = makeFixture([ {expectedBehavior: 'x', id: 'a', taskDescription: 'x'}, @@ -264,6 +286,65 @@ 
describe('KPI harness', () => { it('rejects --runs non-numeric', () => { expect(() => parseArgs(['--runs', 'abc'])).to.throw(/positive integer/) }) + + it('rejects --runs negative', () => { + expect(() => parseArgs(['--runs', '-1'])).to.throw(/positive integer/) + }) + + it('rejects --runs zero', () => { + expect(() => parseArgs(['--runs', '0'])).to.throw(/positive integer/) + }) + + it('rejects --fixture with no argument', () => { + expect(() => parseArgs(['--fixture'])).to.throw(/requires a path/) + }) + + it('rejects --output with no argument', () => { + expect(() => parseArgs(['--output'])).to.throw(/requires a path/) + }) + + it('rejects --runs with no argument', () => { + expect(() => parseArgs(['--runs'])).to.throw(/requires a number/) + }) + }) + + describe('renderReport', () => { + it('includes per-task rates, overall, delta, and ship-gate verdict', () => { + const report: KpiReport = { + delta: 0.4, + fixtureVersion: 'test-1', + harnessSuccessRate: 0.9, + measuredAt: 0, + perTask: [ + {harnessSuccessRate: 0.9, rawSuccessRate: 0.5, taskId: 't01-test'}, + ], + rawSuccessRate: 0.5, + runsPerArm: 10, + targetModel: 'test-model', + } + const output = renderReport(report) + expect(output).to.include('t01-test') + expect(output).to.include('50%') + expect(output).to.include('90%') + expect(output).to.include('+40.0pp') + expect(output).to.include('ship gate met') + }) + + it('shows shortfall when delta is below ship gate', () => { + const report: KpiReport = { + delta: 0.2, + fixtureVersion: 'x', + harnessSuccessRate: 0.6, + measuredAt: 0, + perTask: [], + rawSuccessRate: 0.4, + runsPerArm: 10, + targetModel: 'x', + } + const output = renderReport(report) + expect(output).to.include('NOT met') + expect(output).to.include('10.0pp') + }) }) describe('main (end-to-end with stub LLM)', () => { @@ -307,8 +388,9 @@ describe('KPI harness', () => { caught = error } - expect(caught).to.be.an('error') - expect((caught as Error).message).to.match(/--llm real is not yet 
implemented/) + expect(caught).to.be.instanceOf(Error) + if (!(caught instanceof Error)) throw new Error('unreachable') + expect(caught.message).to.match(/--llm real is not yet implemented/) }) }) })