From 7bec1754ac0da3fead50d2ceac55d56ba2122d17 Mon Sep 17 00:00:00 2001
From: Danh Doan <danhdoancv@gmail.com>
Date: Wed, 22 Apr 2026 15:44:14 +0700
Subject: [PATCH 1/2] =?UTF-8?q?feat:=20[ENG-2332]=20AutoHarness=20V2=20KPI?=
 =?UTF-8?q?=20harness=20=E2=80=94=20scaffolding=20+=20fixture=20+=20stub-L?=
 =?UTF-8?q?LM=20runner?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 8 Task 8.4 — Tier 4 S1 brutal-review item. Ships the
release-engineering measurement tool that produces the v1.0 ship
gate's headline number: the Llama 3.1 8B success-rate delta on a
standardized curate task set, raw `tools.*` vs the current harness.

v1.0 ship gate: delta >= 30 percentage points on the reference
fixture or the release is blocked.

## Layout

All release-engineering tooling for this feature lives together
under `scripts/autoharness-kpi/`:

  scripts/autoharness-kpi/
    runner.ts              — main script (parseArgs + runArm +
                             computeKpiReport + exitCodeForReport
                             + main — all exported, unit-testable)
    fixture-tasks.json     — 20 curate tasks hand-crafted against
                             this codebase (stable across releases)
    expected-baseline.json — placeholder; populated at v1.0 ship
                             with the real-LLM measurement
  test/unit/scripts/autoharness-kpi/
    runner.test.ts         — 19 unit tests
  package.json
    "kpi:harness" — runs via `tsx scripts/autoharness-kpi/runner.ts`

Chose the `scripts/<feature>/` namespace over a top-level `kpi/`
directory: KPI tooling is a subsystem concern (AutoHarness V2 only),
not a first-class project concern. Big-tech repos (Kubernetes /
Rust / CockroachDB / VSCode) consistently keep per-feature
release tooling under an umbrella like `scripts/`, `hack/`, or
`tools/` — never at repo root.

## Scope — scaffolding + stub LLM only

`--llm stub` is fully operational with deterministic canned arm
rates (delta = 50pp, well above the 30pp gate) for script-logic
validation. `--llm real` currently throws a clear "not yet
implemented" error — wiring the real Llama 3.1 8B path into the
agent LLM service lands in a follow-up once Phase 7 finalizes the
command surface.

The fixture + math + CLI + exit-code logic all ship now (zero
conflict risk with Phat's in-flight Phase 6 work) so the
follow-up real-LLM PR is ~50 LOC of wiring.

## Fixture design

20 curate tasks covering concrete byterover-cli subjects:
exported classes under harness/, HarnessConfigSchema fields,
IHarnessStore methods, Zod schema enumeration, project-type
detection rules, mode thresholds, Mode C cap values, feedback
weights, etc. Each task has {id, taskDescription,
expectedBehavior}. Hand-crafted so a reviewer can eye-ball
realism. Aims at the "weak model needs hand-holding" band —
tasks too easy (any model succeeds) or too hard (no model
succeeds) wouldn't discriminate between arms.

## Tests (19)

- loadFixture: shipped fixture parses; malformed/empty rejected
- makeStubLlmClient: deterministic per-(task, arm); unknown-id
  fallback to both-arms-succeed
- runArm: aggregates correctly; overall = flat mean across runs
- computeKpiReport: delta math; metadata carried through
- exitCodeForReport: 0 at exactly 0.30 (inclusive floor), 1 at
  0.29, 1 on negative delta
- parseArgs: defaults; flag parsing; validates --llm + --runs
- main (end-to-end with stub): shipped fixture runs; --output
  writes valid JSON; --llm real surfaces the clear
  "not yet implemented" error

## Not merging yet

Per team process, this PR opens for review; Danh decides separately
whether to merge to proj/autoharness-v2 once review feedback is in.
---
 package.json                                  |   1 +
 .../autoharness-kpi/expected-baseline.json    |  10 +
 scripts/autoharness-kpi/fixture-tasks.json    | 109 +++++
 scripts/autoharness-kpi/runner.ts             | 399 ++++++++++++++++++
 .../scripts/autoharness-kpi/runner.test.ts    | 314 ++++++++++++++
 5 files changed, 833 insertions(+)
 create mode 100644 scripts/autoharness-kpi/expected-baseline.json
 create mode 100644 scripts/autoharness-kpi/fixture-tasks.json
 create mode 100644 scripts/autoharness-kpi/runner.ts
 create mode 100644 test/unit/scripts/autoharness-kpi/runner.test.ts
diff --git a/package.json b/package.json
index 082dd9687..f19d1386c 100644
--- a/package.json
+++ b/package.json
@@ -185,6 +185,7 @@
     "lint": "eslint",
     "lint:fix": "npm run lint -- --fix",
     "postpack": "shx rm -f oclif.manifest.json",
+    "kpi:harness": "tsx scripts/autoharness-kpi/runner.ts",
     "prepack": "npm run build && BRV_ENV=production oclif manifest",
     "prepare": "husky",
     "test": "mocha --forbid-only \"test/**/*.test.ts\"",
diff --git a/scripts/autoharness-kpi/expected-baseline.json b/scripts/autoharness-kpi/expected-baseline.json
new file mode 100644
index 000000000..6c6fef9e6
--- /dev/null
+++ b/scripts/autoharness-kpi/expected-baseline.json
@@ -0,0 +1,10 @@
+{
+  "$comment": "Populated by the first clean KPI run at v1.0 ship time. Until then, this file is a placeholder. On each subsequent ship, compare against these numbers — a significant regression (harnessSuccessRate drops by > 5pp or delta drops by > 5pp) should block the release.",
+  "fixtureVersion": "1.0.0",
+  "targetModel": "llama-3.1-8b-instruct",
+  "runsPerArm": 10,
+  "measuredAt": null,
+  "harnessSuccessRate": null,
+  "rawSuccessRate": null,
+  "delta": null
+}
diff --git a/scripts/autoharness-kpi/fixture-tasks.json b/scripts/autoharness-kpi/fixture-tasks.json
new file mode 100644
index 000000000..268e9e9e6
--- /dev/null
+++ b/scripts/autoharness-kpi/fixture-tasks.json
@@ -0,0 +1,109 @@
+{
+  "$comment": "AutoHarness V2 KPI reference fixture — 20 curate tasks run once with raw tools.* and once with the current harness, both against Llama 3.1 8B Instruct. Stable across ships; regenerate only on deliberate fixture updates. See features/autoharness-v2/tasks/phase_8/task_04-kpi-harness.md for the methodology.",
+  "$schema": "./fixture-tasks.schema.json",
+  "fixtureVersion": "1.0.0",
+  "targetModel": "llama-3.1-8b-instruct",
+  "commandType": "curate",
+  "tasks": [
+    {
+      "id": "t01-list-exports",
+      "taskDescription": "Curate every exported class name from `src/agent/infra/harness/` along with its source file path.",
+      "expectedBehavior": "Returns a non-empty list of `{className, filePath}` entries covering HarnessStore, HarnessBootstrap, HarnessModuleBuilder, HarnessOutcomeRecorder, HarnessSynthesizer, HarnessEvaluator, HarnessScenarioCapture (at minimum)."
+    },
+    {
+      "id": "t02-config-fields",
+      "taskDescription": "Curate all fields defined on HarnessConfigSchema in `src/agent/infra/agent/agent-schemas.ts`.",
+      "expectedBehavior": "Returns the full field list: autoLearn, enabled, language, maxVersions, modeOverride, refinementModel."
+    },
+    {
+      "id": "t03-interface-methods",
+      "taskDescription": "Curate every method name on the IHarnessStore interface.",
+      "expectedBehavior": "Returns all ~11 methods (deleteOutcomes, getLatest, getVersion, listOutcomes, listScenarios, listVersions, pruneOldVersions, recordFeedback, saveOutcome, saveScenario, saveVersion)."
+    },
+    {
+      "id": "t04-zod-schemas",
+      "taskDescription": "List every Zod schema exported from `src/agent/core/domain/harness/types.ts`.",
+      "expectedBehavior": "Returns HarnessCapabilitySchema, HarnessLanguageSchema, HarnessMetaSchema, HarnessModeSchema, HarnessVersionSchema, CodeExecOutcomeSchema, EvaluationScenarioSchema, ProjectTypeSchema."
+    },
+    {
+      "id": "t05-error-codes",
+      "taskDescription": "Curate all error codes on HarnessStoreErrorCode.",
+      "expectedBehavior": "Returns OUTCOME_NOT_FOUND, VERSION_CONFLICT."
+    },
+    {
+      "id": "t06-template-names",
+      "taskDescription": "Curate all harness template files under `src/agent/infra/harness/templates/`.",
+      "expectedBehavior": "Returns the 3 shipped curate templates: curate/typescript.ts, curate/python.ts, curate/generic.ts."
+    },
+    {
+      "id": "t07-mode-thresholds",
+      "taskDescription": "Curate the heuristic thresholds that map H to HarnessMode.",
+      "expectedBehavior": "Returns {assisted: 0.30, filter: 0.60, policy: 0.85} or the equivalent ranges."
+    },
+    {
+      "id": "t08-oclif-commands",
+      "taskDescription": "List every `brv harness <subcommand>` command file under `src/oclif/commands/harness/`.",
+      "expectedBehavior": "Returns the shipped subcommand names. (Minimum set grows as Phase 7 lands; empty result when phase not shipped is acceptable.)"
+    },
+    {
+      "id": "t09-sandbox-services",
+      "taskDescription": "Curate every `set*` method on SandboxService.",
+      "expectedBehavior": "Returns setHarnessConfig, setHarnessStore, setHarnessModuleBuilder, setHarnessOutcomeRecorder, setEnvironmentContext, setFileSystem, setContentGenerator, etc."
+    },
+    {
+      "id": "t10-event-payloads",
+      "taskDescription": "Curate the payload fields of the harness:refinement-completed event.",
+      "expectedBehavior": "Returns fromVersionId, accepted, toVersionId?, deltaH?, reason?."
+    },
+    {
+      "id": "t11-tools-sdk-methods",
+      "taskDescription": "Curate every method surface exposed by HarnessContextTools in types.ts.",
+      "expectedBehavior": "Returns curate (+ signature) and readFile (+ signature). v1.0 surface is exactly those two."
+    },
+    {
+      "id": "t12-capability-tags",
+      "taskDescription": "Curate every HarnessCapability tag.",
+      "expectedBehavior": "Returns discover, extract, buildOps, search, gather, curate, answer."
+    },
+    {
+      "id": "t13-phase-task-docs",
+      "taskDescription": "Curate every task doc under `features/autoharness-v2/tasks/phase_5/`.",
+      "expectedBehavior": "Returns the 5 task docs: task_01 mode selector, task_02 prompt contributor, task_03 Mode C caps, task_04 agent wiring, task_05 integration test."
+    },
+    {
+      "id": "t14-numeric-parameters",
+      "taskDescription": "Curate the numeric parameters locked in v1-design-decisions.md §2.1 (heuristic formula weights + windowing).",
+      "expectedBehavior": "Returns formula weights {successRate: 0.2, errorRate: 0.3, realHarnessRate: 0.5}, window size 50, min-sample floor 10, recency 30-day half-life, pass-through cap 0.55."
+    },
+    {
+      "id": "t15-mode-c-caps",
+      "taskDescription": "Curate the two Mode C safety caps and their values.",
+      "expectedBehavior": "Returns ops cap 50 per curate() call, rate cap 30 mapExtract/60s process-wide."
+    },
+    {
+      "id": "t16-refinement-gates",
+      "taskDescription": "Curate the refinement acceptance thresholds.",
+      "expectedBehavior": "Returns ΔH improvement ≥ 0.05, evaluation runs per candidate 10, max versions per pair 20, refinement queue concurrency 1 per pair."
+    },
+    {
+      "id": "t17-feedback-weights",
+      "taskDescription": "Curate the user-feedback insertion weights.",
+      "expectedBehavior": "Returns bad → 3 synthetic failures, good → 1 synthetic success, cap 10 per window."
+    },
+    {
+      "id": "t18-project-detector-rules",
+      "taskDescription": "Curate the project-type detection rules in project-detector.ts.",
+      "expectedBehavior": "Returns: tsconfig.json → typescript; package.json with typescript dep → typescript; pyproject.toml → python; setup.py → python; setup.cfg → python; polyglot → both."
+    },
+    {
+      "id": "t19-weak-model-block-list",
+      "taskDescription": "Curate the weak-model block list entries (refinement skipped).",
+      "expectedBehavior": "Returns all entries in REFINEMENT_MODEL_BLOCKLIST (llama-3.1-8b-instruct, mistral-7b-instruct, qwen-2.5-7b-instruct, gemma-2-9b-it, phi-3-mini)."
+    },
+    {
+      "id": "t20-phase-ship-gates",
+      "taskDescription": "Curate each phase's ship-gate integration test file path.",
+      "expectedBehavior": "Returns Phase 3 isolation.test.ts, Phase 4 cold-start.test.ts, Phase 5 mode-selection.test.ts, Phase 6 learning-loop.test.ts, Phase 7 cli-lifecycle.test.ts, Phase 8 full-lifecycle.test.ts."
+    }
+  ]
+}
diff --git a/scripts/autoharness-kpi/runner.ts b/scripts/autoharness-kpi/runner.ts
new file mode 100644
index 000000000..884395b92
--- /dev/null
+++ b/scripts/autoharness-kpi/runner.ts
@@ -0,0 +1,399 @@
+#!/usr/bin/env node
+/**
+ * AutoHarness V2 — KPI harness (Phase 8 Task 8.4).
+ *
+ * Runs the reference fixture (`scripts/autoharness-kpi/fixture-tasks.json`)
+ * through two arms:
+ *   - Arm A: raw `tools.*` orchestration (no harness)
+ *   - Arm B: current harness version
+ *
+ * Reports the success-rate delta. v1.0 ship gate:
+ *   delta >= 0.30   → exit 0
+ *   delta <  0.30   → exit 1 (block the release); a thrown error → exit 2
+ *
+ * Usage:
+ *   npm run kpi:harness -- [--fixture <path>]
+ *                          [--runs <N>]
+ *                          [--output <path>]
+ *                          [--llm stub|real]
+ *
+ * Defaults: `--llm stub` (deterministic synthetic arm results for
+ * CI validation of the script itself). Real-LLM measurement runs
+ * are triggered pre-ship via `--llm real` and require the agent
+ * stack to be wired — landing in a follow-up PR.
+ */
+
+import {readFileSync, writeFileSync} from 'node:fs'
+import {resolve} from 'node:path'
+
+// ─────────────────────────────────────────────────────────────────────────
+// Types
+// ─────────────────────────────────────────────────────────────────────────
+
+export interface FixtureTask {
+  readonly expectedBehavior: string // prose describing what a successful result contains
+  readonly id: string // e.g. "t01-list-exports"; the stub client keys on the part before the first "-"
+  readonly taskDescription: string // prose describing what the task asks for
+}
+
+export interface Fixture {
+  readonly commandType: 'chat' | 'curate' | 'query' // loadFixture uses 'curate' when the file omits it
+  readonly fixtureVersion: string // loadFixture uses 'unversioned' when the file omits it
+  readonly targetModel: string // loadFixture uses 'unknown' when the file omits it
+  readonly tasks: readonly FixtureTask[] // loadFixture rejects a missing or empty list
+}
+
+export type ArmName = 'harness' | 'raw' // 'raw' = Arm A (no harness), 'harness' = Arm B (current harness)
+
+export interface ArmRunResult {
+  readonly runs: readonly boolean[] // one entry per run, in execution order; true = success
+  readonly successRate: number // fraction of `runs` that are true
+  readonly taskId: string // FixtureTask.id this result belongs to
+}
+
+export interface ArmResult {
+  readonly arm: ArmName // which arm produced these results
+  readonly overallSuccessRate: number // flat mean over every run of every task (not a mean of per-task rates)
+  readonly perTask: readonly ArmRunResult[] // one entry per task, in the order the tasks were given
+}
+
+export interface KpiReport {
+  readonly delta: number // harnessSuccessRate - rawSuccessRate, as a fraction (0.30 = 30pp)
+  readonly fixtureVersion: string // copied from the fixture
+  readonly harnessSuccessRate: number // overall success rate of the harness arm
+  readonly measuredAt: number // epoch milliseconds; Date.now() unless the caller supplies one
+  readonly perTask: ReadonlyArray<{
+    readonly harnessSuccessRate: number
+    readonly rawSuccessRate: number // 0 when the raw arm has no entry for this task
+    readonly taskId: string
+  }>
+  readonly rawSuccessRate: number // overall success rate of the raw arm
+  readonly runsPerArm: number // runs per task within each arm
+  readonly targetModel: string // copied from the fixture
+}
+
+/**
+ * Shape of an LLM backend the KPI runner delegates to. Stubbed in
+ * the default `--llm stub` path; real path (follow-up PR) wires this
+ * into the agent stack with a Llama 3.1 8B provider.
+ */
+export interface KpiLlmClient {
+  runTask(task: FixtureTask, arm: ArmName): Promise<boolean> // resolves true when the run counts as a success
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// Constants — ship-gate delta + stub arm success rates
+// ─────────────────────────────────────────────────────────────────────────
+
+/** v1.0 ship-gate threshold per `v1-design-decisions.md §2.7`: minimum delta as a fraction (0.3 = 30pp), inclusive. */
+export const SHIP_GATE_DELTA = 0.3
+
+// Stub outcomes keyed by task-id prefix (the part before the first "-").
+// Deterministic so the script's exit-code logic is testable; the stub
+// client treats a rate of exactly 1 as success and anything else as failure.
+//   - Tasks t01-t10: raw always fails, harness always succeeds (delta 100%)
+//   - Tasks t11-t20: both arms always succeed (delta 0%)
+//   - Overall on the shipped 20-task fixture: raw 50%, harness 100% → delta 0.50
+const STUB_RATES: Readonly<Record<string, {harness: number; raw: number;}>> = {
+  t01: {harness: 1, raw: 0},
+  t02: {harness: 1, raw: 0},
+  t03: {harness: 1, raw: 0},
+  t04: {harness: 1, raw: 0},
+  t05: {harness: 1, raw: 0},
+  t06: {harness: 1, raw: 0},
+  t07: {harness: 1, raw: 0},
+  t08: {harness: 1, raw: 0},
+  t09: {harness: 1, raw: 0},
+  t10: {harness: 1, raw: 0},
+  t11: {harness: 1, raw: 1},
+  t12: {harness: 1, raw: 1},
+  t13: {harness: 1, raw: 1},
+  t14: {harness: 1, raw: 1},
+  t15: {harness: 1, raw: 1},
+  t16: {harness: 1, raw: 1},
+  t17: {harness: 1, raw: 1},
+  t18: {harness: 1, raw: 1},
+  t19: {harness: 1, raw: 1},
+  t20: {harness: 1, raw: 1},
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// Core functions (unit-tested; loadFixture reads from disk)
+// ─────────────────────────────────────────────────────────────────────────
+
+/** Parse and validate a fixture file. Throws on malformed/duplicate tasks or mistyped metadata; absent metadata gets defaults. */
+export function loadFixture(path: string): Fixture {
+  const raw = JSON.parse(readFileSync(path, 'utf8')) as unknown
+  if (typeof raw !== 'object' || raw === null) throw new TypeError(`fixture is not an object: ${path}`)
+  const parsed = raw as Record<string, unknown>
+  const {tasks} = parsed
+  if (!Array.isArray(tasks) || tasks.length === 0) {
+    throw new Error(`fixture has no tasks: ${path}`)
+  }
+
+  const seenIds = new Set<string>()
+  for (const task of tasks as unknown[]) {
+    const t = (typeof task === 'object' && task !== null ? task : {}) as Record<string, unknown>
+    if (typeof t.id !== 'string' || typeof t.taskDescription !== 'string' || typeof t.expectedBehavior !== 'string') {
+      throw new Error(`fixture task malformed: ${JSON.stringify(task)}`)
+    }
+    // The report joins both arms by task id, so a repeated id would silently merge rows.
+    if (seenIds.has(t.id)) throw new Error(`fixture task id duplicated: ${t.id}`)
+    seenIds.add(t.id)
+  }
+
+  const commandType: unknown = parsed.commandType ?? 'curate'
+  const fixtureVersion: unknown = parsed.fixtureVersion ?? 'unversioned'
+  const targetModel: unknown = parsed.targetModel ?? 'unknown'
+  if (commandType !== 'chat' && commandType !== 'curate' && commandType !== 'query') {
+    throw new Error(`fixture commandType invalid: ${JSON.stringify(commandType)} (${path})`)
+  }
+  if (typeof fixtureVersion !== 'string' || typeof targetModel !== 'string') {
+    throw new Error(`fixture fixtureVersion/targetModel must be strings: ${path}`)
+  }
+  return {commandType, fixtureVersion, targetModel, tasks: tasks as FixtureTask[]}
+}
+
+/**
+ * Deterministic stub LLM client for `--llm stub` and unit tests, so the delta
+ * math + exit logic can be exercised without a real model. The id prefix before
+ * the first `-` selects a `STUB_RATES` row; only a rate of exactly 1 succeeds.
+ */
+export function makeStubLlmClient(): KpiLlmClient {
+  return {
+    async runTask(task: FixtureTask, arm: ArmName): Promise<boolean> {
+      const idKey = task.id.split('-')[0] ?? '' // e.g. "t01" from "t01-list-exports"
+      // Own-property check: a plain lookup would resolve ids such as
+      // "constructor-x" to inherited Object.prototype members, not a rate row.
+      const known = Object.prototype.hasOwnProperty.call(STUB_RATES, idKey)
+      const rates = known ? STUB_RATES[idKey] : undefined
+      // Unknown task id → "both arms succeed", so custom fixtures stay predictable.
+      if (rates === undefined) return true
+
+      return arm === 'raw' ? rates.raw === 1 : rates.harness === 1
+    },
+  }
+}
+
+/**
+ * Run one arm over the full fixture, `runs` times per task, one call at a time.
+ * `overallSuccessRate` is the flat mean over every run; rates are 0 (not NaN) when nothing ran.
+ */
+export async function runArm(
+  arm: ArmName,
+  tasks: readonly FixtureTask[],
+  runs: number,
+  client: KpiLlmClient,
+): Promise<ArmResult> {
+  const perTask: ArmRunResult[] = []
+  let totalRuns = 0
+  let totalSuccesses = 0
+
+  for (const task of tasks) {
+    const results: boolean[] = []
+    for (let i = 0; i < runs; i++) {
+      // eslint-disable-next-line no-await-in-loop
+      results.push(await client.runTask(task, arm))
+    }
+
+    const successes = results.filter(Boolean).length
+    totalRuns += results.length
+    totalSuccesses += successes
+    // Same zero guard as the overall rate: 0/0 is NaN, which JSON.stringify writes as null.
+    perTask.push({runs: results, successRate: results.length === 0 ? 0 : successes / results.length, taskId: task.id})
+  }
+
+  return {arm, overallSuccessRate: totalRuns === 0 ? 0 : totalSuccesses / totalRuns, perTask}
+}
+
+/** Combine two arm results into the final KPI report; `delta` is rounded to 12 decimals to strip floating-point noise. */
+export function computeKpiReport(args: {
+  readonly fixture: Fixture
+  readonly harnessArm: ArmResult
+  readonly measuredAt?: number
+  readonly rawArm: ArmResult
+  readonly runsPerArm: number
+}): KpiReport {
+  const {fixture, harnessArm, rawArm, runsPerArm} = args
+  const harnessSuccessRate = harnessArm.overallSuccessRate
+  const rawSuccessRate = rawArm.overallSuccessRate
+  const byTaskId = new Map(rawArm.perTask.map((t) => [t.taskId, t.successRate]))
+  const perTask = harnessArm.perTask.map((h) => ({
+    harnessSuccessRate: h.successRate,
+    rawSuccessRate: byTaskId.get(h.taskId) ?? 0,
+    taskId: h.taskId,
+  }))
+
+  return {
+    // 0.7 - 0.4 evaluates to 0.29999999999999993; round the noise away so an exact 30pp delta passes the inclusive gate.
+    delta: Math.round((harnessSuccessRate - rawSuccessRate) * 1e12) / 1e12,
+    fixtureVersion: fixture.fixtureVersion,
+    harnessSuccessRate,
+    measuredAt: args.measuredAt ?? Date.now(),
+    perTask,
+    rawSuccessRate,
+    runsPerArm,
+    targetModel: fixture.targetModel,
+  }
+}
+
+/** Exit code from a report: 0 if delta ≥ SHIP_GATE_DELTA (inclusive), else 1 — a NaN delta also yields 1. */
+export function exitCodeForReport(report: KpiReport): 0 | 1 {
+  return report.delta >= SHIP_GATE_DELTA ? 0 : 1
+}
+
+/** Human-readable table: header, one row per task (whole-number %), overall rates, delta in pp, and the ship-gate verdict. */
+export function renderReport(report: KpiReport): string {
+  const lines: string[] = []
+  lines.push(
+    `KPI report — fixture ${report.fixtureVersion} × model ${report.targetModel}`,
+    `runs/arm: ${report.runsPerArm}`,
+    '',
+    `task                         raw     harness`,
+    `─`.repeat(50),
+  )
+  for (const t of report.perTask) {
+    const id = t.taskId.padEnd(28)
+    const raw = `${Math.round(t.rawSuccessRate * 100)}%`.padStart(5)
+    const harness = `${Math.round(t.harnessSuccessRate * 100)}%`.padStart(8)
+    lines.push(`${id} ${raw}   ${harness}`)
+  }
+
+  lines.push(
+    `─`.repeat(50),
+    `overall:                     ${(report.rawSuccessRate * 100).toFixed(1).padStart(5)}%  ${(report.harnessSuccessRate * 100).toFixed(1).padStart(7)}%`,
+    `delta:                       ${(report.delta >= 0 ? '+' : '') + (report.delta * 100).toFixed(1)}pp`,
+    report.delta >= SHIP_GATE_DELTA
+      ? `✓ ship gate met (>= ${SHIP_GATE_DELTA * 100}pp)`
+      : `✗ ship gate NOT met — shortfall ${((SHIP_GATE_DELTA - report.delta) * 100).toFixed(1)}pp`,
+  )
+
+  return lines.join('\n')
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// CLI
+// ─────────────────────────────────────────────────────────────────────────
+
+interface CliArgs {
+  readonly fixture: string // fixture JSON path; main() passes it through path.resolve()
+  readonly llm: 'real' | 'stub' // 'real' currently makes main() throw "not yet implemented"
+  readonly output?: string // when set, main() writes the JSON report to this path
+  readonly runs: number // runs per task in each arm
+}
+
+/**
+ * Parse CLI flags (`process.argv.slice(2)` style) into {@link CliArgs}.
+ * Defaults: shipped fixture path, `--llm stub`, 10 runs, no output file.
+ * @throws Error on an unknown argument, a flag missing its value, an invalid
+ *   `--llm` value, or a `--runs` value that is not a positive decimal integer.
+ */
+export function parseArgs(argv: readonly string[]): CliArgs {
+  let fixture = 'scripts/autoharness-kpi/fixture-tasks.json'
+  let output: string | undefined
+  let runs = 10
+  let llm: CliArgs['llm'] = 'stub'
+
+  for (let i = 0; i < argv.length; i++) {
+    const flag = argv[i]
+    const value = argv[i + 1] // every supported flag takes exactly one value
+    switch (flag) {
+    case '--fixture': {
+      if (value === undefined) throw new Error('--fixture requires a path')
+      fixture = value
+      break
+    }
+
+    case '--llm': {
+      if (value !== 'stub' && value !== 'real') {
+        throw new Error(`--llm must be 'stub' or 'real', got: ${value ?? '<missing>'}`)
+      }
+
+      llm = value
+      break
+    }
+
+    case '--output': {
+      if (value === undefined) throw new Error('--output requires a path')
+      output = value
+      break
+    }
+
+    case '--runs': {
+      if (value === undefined) throw new Error('--runs requires a number')
+      // parseInt alone accepts "10abc" (→ 10) and "2.9" (→ 2); demand digits only.
+      runs = /^\d+$/.test(value) ? Number.parseInt(value, 10) : Number.NaN
+      if (!Number.isSafeInteger(runs) || runs <= 0) {
+        throw new Error(`--runs must be a positive integer, got: ${value}`)
+      }
+      break
+    }
+
+    default: {
+      // A silently ignored typo (e.g. `--run 50`) would measure the wrong config.
+      throw new Error(`unknown argument: ${flag}`)
+    }
+    }
+    i++ // skip the value token consumed by the case above
+  }
+
+  return {fixture, llm, output, runs}
+}
+
+export async function main(argv: readonly string[]): Promise<number> {
+  const args = parseArgs(argv)
+  const fixture = loadFixture(resolve(args.fixture))
+
+  let client: KpiLlmClient
+  if (args.llm === 'stub') {
+    client = makeStubLlmClient()
+  } else {
+    // Real-LLM path — requires the agent stack wired for Llama 3.1 8B.
+    // Lands in a follow-up PR once the KPI run is exercised pre-ship.
+    throw new Error(
+      "--llm real is not yet implemented in this PR — use --llm stub for script validation. " +
+        'The real-LLM path will wire into the agent LLM service in a follow-up; see task_04-kpi-harness.md.',
+    )
+  }
+
+  const rawArm = await runArm('raw', fixture.tasks, args.runs, client)
+  const harnessArm = await runArm('harness', fixture.tasks, args.runs, client)
+  const report = computeKpiReport({
+    fixture,
+    harnessArm,
+    rawArm,
+    runsPerArm: args.runs,
+  })
+
+  // Print the table unconditionally; the JSON report is written only when --output is given.
+  console.log(renderReport(report))
+  if (args.output !== undefined) {
+    writeFileSync(resolve(args.output), JSON.stringify(report, null, 2))
+  }
+
+  return exitCodeForReport(report)
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// Entry point (only runs when invoked directly via `tsx`/`node`)
+// ─────────────────────────────────────────────────────────────────────────
+
+// Exit codes: 0 = ship gate met, 1 = gate not met, 2 = the run itself threw.
+// Top-level await is fine here: the guarded block only runs when this file is
+// the process entry point, not when tests import it. `pathToFileURL` encodes
+// the path the way `import.meta.url` is encoded (spaces, `#`, drive letters).
+const {pathToFileURL} = await import('node:url')
+const invokedDirectly = process.argv[1] !== undefined && import.meta.url === pathToFileURL(process.argv[1]).href
+if (invokedDirectly) {
+  try {
+    const code = await main(process.argv.slice(2))
+    // eslint-disable-next-line n/no-process-exit
+    process.exit(code)
+  } catch (error) {
+    console.error(
+      `kpi-harness failed: ${error instanceof Error ? error.message : String(error)}`,
+    )
+    // eslint-disable-next-line n/no-process-exit
+    process.exit(2)
+  }
+}
diff --git a/test/unit/scripts/autoharness-kpi/runner.test.ts b/test/unit/scripts/autoharness-kpi/runner.test.ts
new file mode 100644
index 000000000..da5ac465c
--- /dev/null
+++ b/test/unit/scripts/autoharness-kpi/runner.test.ts
@@ -0,0 +1,314 @@
+import {expect} from 'chai'
+import {mkdtempSync, readFileSync, rmSync, writeFileSync} from 'node:fs'
+import {tmpdir} from 'node:os'
+import {join} from 'node:path'
+
+import type {
+  ArmResult,
+  Fixture,
+  FixtureTask,
+  KpiLlmClient,
+  KpiReport,
+} from '../../../../scripts/autoharness-kpi/runner.js'
+
+import {
+  computeKpiReport,
+  exitCodeForReport,
+  loadFixture,
+  main,
+  makeStubLlmClient,
+  parseArgs,
+  runArm,
+  SHIP_GATE_DELTA,
+} from '../../../../scripts/autoharness-kpi/runner.js'
+
+// ─────────────────────────────────────────────────────────────────────────
+// Helpers
+// ─────────────────────────────────────────────────────────────────────────
+
+const REPO_FIXTURE_PATH = 'scripts/autoharness-kpi/fixture-tasks.json'
+
+function makeFixture(tasks: FixtureTask[]): Fixture { // wraps `tasks` in fixed test metadata
+  return {
+    commandType: 'curate',
+    fixtureVersion: 'test-1', // asserted by the metadata carry-through test
+    targetModel: 'stub', // asserted by the metadata carry-through test
+    tasks,
+  }
+}
+
+function makeArmResult(
+  arm: 'harness' | 'raw',
+  taskIds: readonly string[],
+  rate: number, // used verbatim as every per-task rate and as the overall rate
+  runs: number,
+): ArmResult {
+  const perTask = taskIds.map((id) => {
+    const successes = Math.round(rate * runs) // nearest whole number of successful runs
+    const runsResult: boolean[] = []
+    for (let i = 0; i < runs; i++) runsResult.push(i < successes) // successes first, then failures
+    return {runs: runsResult, successRate: rate, taskId: id}
+  })
+  return {arm, overallSuccessRate: rate, perTask}
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// Tests
+// ─────────────────────────────────────────────────────────────────────────
+
+describe('KPI harness', () => {
+  describe('loadFixture', () => {
+    it('parses the shipped reference fixture', () => {
+      const fixture = loadFixture(REPO_FIXTURE_PATH)
+      expect(fixture.fixtureVersion).to.equal('1.0.0')
+      expect(fixture.targetModel).to.equal('llama-3.1-8b-instruct')
+      expect(fixture.commandType).to.equal('curate')
+      expect(fixture.tasks).to.have.length(20)
+      for (const task of fixture.tasks) {
+        expect(task.id).to.be.a('string').and.match(/^t\d{2}-/)
+        expect(task.taskDescription).to.be.a('string').and.not.be.empty
+        expect(task.expectedBehavior).to.be.a('string').and.not.be.empty
+      }
+    })
+
+    it('throws on a malformed fixture', () => {
+      const dir = mkdtempSync(join(tmpdir(), 'kpi-bad-'))
+      try {
+        const bad = join(dir, 'bad.json')
+        writeFileSync(bad, JSON.stringify({tasks: [{id: 1}]}))
+        expect(() => loadFixture(bad)).to.throw(/malformed/)
+      } finally {
+        rmSync(dir, {force: true, recursive: true})
+      }
+    })
+
+    it('throws when tasks array is empty', () => {
+      const dir = mkdtempSync(join(tmpdir(), 'kpi-empty-'))
+      try {
+        const empty = join(dir, 'empty.json')
+        writeFileSync(empty, JSON.stringify({tasks: []}))
+        expect(() => loadFixture(empty)).to.throw(/no tasks/)
+      } finally {
+        rmSync(dir, {force: true, recursive: true})
+      }
+    })
+  })
+
+  describe('makeStubLlmClient', () => {
+    it('returns deterministic results per task + arm', async () => {
+      const client = makeStubLlmClient()
+      const task: FixtureTask = {
+        expectedBehavior: 'x',
+        id: 't01-list-exports',
+        taskDescription: 'x',
+      }
+      // t01: raw=0, harness=1
+      expect(await client.runTask(task, 'raw')).to.equal(false)
+      expect(await client.runTask(task, 'harness')).to.equal(true)
+      // Deterministic — repeated calls agree.
+      expect(await client.runTask(task, 'raw')).to.equal(false)
+      expect(await client.runTask(task, 'harness')).to.equal(true)
+    })
+
+    it('defaults to both-arms-succeed for unknown task ids', async () => {
+      const client = makeStubLlmClient()
+      const task: FixtureTask = {
+        expectedBehavior: 'x',
+        id: 'unknown-task',
+        taskDescription: 'x',
+      }
+      expect(await client.runTask(task, 'raw')).to.equal(true)
+      expect(await client.runTask(task, 'harness')).to.equal(true)
+    })
+  })
+
+  describe('runArm', () => {
+    it('aggregates success rate across N runs per task', async () => {
+      const client: KpiLlmClient = {
+        runTask: async () => true,
+      }
+      const tasks: FixtureTask[] = [
+        {expectedBehavior: 'x', id: 'a', taskDescription: 'x'},
+        {expectedBehavior: 'y', id: 'b', taskDescription: 'y'},
+      ]
+      const result = await runArm('raw', tasks, 5, client)
+      expect(result.perTask).to.have.length(2)
+      expect(result.perTask[0].runs).to.have.length(5)
+      expect(result.overallSuccessRate).to.equal(1)
+    })
+
+    it('computes overall success rate as the flat mean across all runs', async () => {
+      // Half of the 4 tasks always succeed; other half always fail.
+      // With 2 runs each: 4 successes / 8 runs = 0.5.
+      const client: KpiLlmClient = {
+        runTask: async (task) => task.id.startsWith('good'),
+      }
+      const tasks: FixtureTask[] = [
+        {expectedBehavior: 'x', id: 'good-1', taskDescription: 'x'},
+        {expectedBehavior: 'x', id: 'good-2', taskDescription: 'x'},
+        {expectedBehavior: 'x', id: 'bad-1', taskDescription: 'x'},
+        {expectedBehavior: 'x', id: 'bad-2', taskDescription: 'x'},
+      ]
+      const result = await runArm('harness', tasks, 2, client)
+      expect(result.overallSuccessRate).to.equal(0.5)
+    })
+  })
+
+  describe('computeKpiReport', () => {
+    it('computes delta as harness minus raw', () => {
+      const fixture = makeFixture([
+        {expectedBehavior: 'x', id: 'a', taskDescription: 'x'},
+      ])
+      const rawArm = makeArmResult('raw', ['a'], 0.4, 10)
+      const harnessArm = makeArmResult('harness', ['a'], 0.8, 10)
+      const report = computeKpiReport({fixture, harnessArm, rawArm, runsPerArm: 10})
+      expect(report.rawSuccessRate).to.equal(0.4)
+      expect(report.harnessSuccessRate).to.equal(0.8)
+      expect(report.delta).to.be.closeTo(0.4, 1e-9)
+    })
+
+    it('carries fixture + model metadata through to the report', () => {
+      const fixture = makeFixture([
+        {expectedBehavior: 'x', id: 'a', taskDescription: 'x'},
+      ])
+      const rawArm = makeArmResult('raw', ['a'], 0, 1)
+      const harnessArm = makeArmResult('harness', ['a'], 1, 1)
+      const report = computeKpiReport({
+        fixture,
+        harnessArm,
+        measuredAt: 1_700_000_000_000,
+        rawArm,
+        runsPerArm: 1,
+      })
+      expect(report.fixtureVersion).to.equal('test-1')
+      expect(report.targetModel).to.equal('stub')
+      expect(report.runsPerArm).to.equal(1)
+      expect(report.measuredAt).to.equal(1_700_000_000_000)
+    })
+  })
+
+  describe('exitCodeForReport', () => {
+    it('exits 0 when delta ≥ ship-gate threshold (0.30)', () => {
+      const report: KpiReport = {
+        delta: SHIP_GATE_DELTA,
+        fixtureVersion: 'x',
+        harnessSuccessRate: 0.7,
+        measuredAt: 0,
+        perTask: [],
+        rawSuccessRate: 0.4,
+        runsPerArm: 10,
+        targetModel: 'x',
+      }
+      expect(exitCodeForReport(report)).to.equal(0)
+    })
+
+    it('exits 1 when delta is just below (0.29)', () => {
+      const report: KpiReport = {
+        delta: 0.29,
+        fixtureVersion: 'x',
+        harnessSuccessRate: 0.69,
+        measuredAt: 0,
+        perTask: [],
+        rawSuccessRate: 0.4,
+        runsPerArm: 10,
+        targetModel: 'x',
+      }
+      expect(exitCodeForReport(report)).to.equal(1)
+    })
+
+    it('exits 1 for zero / negative delta (harness worse)', () => {
+      const report: KpiReport = {
+        delta: -0.1,
+        fixtureVersion: 'x',
+        harnessSuccessRate: 0.3,
+        measuredAt: 0,
+        perTask: [],
+        rawSuccessRate: 0.4,
+        runsPerArm: 10,
+        targetModel: 'x',
+      }
+      expect(exitCodeForReport(report)).to.equal(1)
+    })
+  })
+
+  describe('parseArgs', () => {
+    it('uses sane defaults', () => {
+      const args = parseArgs([])
+      expect(args.fixture).to.equal('scripts/autoharness-kpi/fixture-tasks.json')
+      expect(args.llm).to.equal('stub')
+      expect(args.runs).to.equal(10)
+      expect(args.output).to.equal(undefined)
+    })
+
+    it('honors --fixture, --runs, --output, --llm', () => {
+      const args = parseArgs([
+        '--fixture',
+        '/tmp/f.json',
+        '--runs',
+        '5',
+        '--output',
+        '/tmp/o.json',
+        '--llm',
+        'stub',
+      ])
+      expect(args.fixture).to.equal('/tmp/f.json')
+      expect(args.runs).to.equal(5)
+      expect(args.output).to.equal('/tmp/o.json')
+      expect(args.llm).to.equal('stub')
+    })
+
+    it('rejects --llm other than stub|real', () => {
+      expect(() => parseArgs(['--llm', 'fake'])).to.throw(/must be 'stub' or 'real'/)
+    })
+
+    it('rejects --runs non-numeric', () => {
+      expect(() => parseArgs(['--runs', 'abc'])).to.throw(/positive integer/)
+    })
+  })
+
+  describe('main (end-to-end with stub LLM)', () => {
+    it('runs the shipped fixture and exits 0 at the ship gate', async function () {
+      this.timeout(5000)
+      const code = await main(['--fixture', REPO_FIXTURE_PATH, '--runs', '1'])
+      // Stub rates: 10 tasks with delta 100%, 10 with delta 0% →
+      // overall delta 0.50, well above the 0.30 gate.
+      expect(code).to.equal(0)
+    })
+
+    it('writes the report JSON when --output is given', async function () {
+      this.timeout(5000)
+      const dir = mkdtempSync(join(tmpdir(), 'kpi-out-'))
+      try {
+        const out = join(dir, 'report.json')
+        const code = await main([
+          '--fixture',
+          REPO_FIXTURE_PATH,
+          '--runs',
+          '1',
+          '--output',
+          out,
+        ])
+        expect(code).to.equal(0)
+        const report = JSON.parse(readFileSync(out, 'utf8')) as KpiReport
+        expect(report.rawSuccessRate).to.equal(0.5)
+        expect(report.harnessSuccessRate).to.equal(1)
+        expect(report.delta).to.equal(0.5)
+        expect(report.perTask).to.have.length(20)
+      } finally {
+        rmSync(dir, {force: true, recursive: true})
+      }
+    })
+
+    it('throws a clear error when --llm real is requested (follow-up work)', async () => {
+      let caught: unknown
+      try {
+        await main(['--fixture', REPO_FIXTURE_PATH, '--llm', 'real'])
+      } catch (error) {
+        caught = error
+      }
+
+      expect(caught).to.be.an('error')
+      expect((caught as Error).message).to.match(/--llm real is not yet implemented/)
+    })
+  })
+})

From 9c366d3138c422f15cdb134400c95a22b45eb0e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nguy=E1=BB=85n=20Thu=E1=BA=ADn=20Ph=C3=A1t?=
 <nguyenthuanphatvl@gmail.com>
Date: Thu, 23 Apr 2026 16:55:23 +0700
Subject: [PATCH 2/2] refactor: [ENG-2332] address review-agent feedback on KPI
 harness

---
 scripts/autoharness-kpi/fixture-tasks.json    |  1 -
 scripts/autoharness-kpi/runner.ts             | 60 +++++++++----
 .../scripts/autoharness-kpi/runner.test.ts    | 86 ++++++++++++++++++-
 3 files changed, 127 insertions(+), 20 deletions(-)

diff --git a/scripts/autoharness-kpi/fixture-tasks.json b/scripts/autoharness-kpi/fixture-tasks.json
index 268e9e9e6..889817e34 100644
--- a/scripts/autoharness-kpi/fixture-tasks.json
+++ b/scripts/autoharness-kpi/fixture-tasks.json
@@ -1,6 +1,5 @@
 {
   "$comment": "AutoHarness V2 KPI reference fixture — 20 curate tasks run once with raw tools.* and once with the current harness, both against Llama 3.1 8B Instruct. Stable across ships; regenerate only on deliberate fixture updates. See features/autoharness-v2/tasks/phase_8/task_04-kpi-harness.md for the methodology.",
-  "$schema": "./fixture-tasks.schema.json",
   "fixtureVersion": "1.0.0",
   "targetModel": "llama-3.1-8b-instruct",
   "commandType": "curate",
diff --git a/scripts/autoharness-kpi/runner.ts b/scripts/autoharness-kpi/runner.ts
index 884395b92..6f467e3a8 100644
--- a/scripts/autoharness-kpi/runner.ts
+++ b/scripts/autoharness-kpi/runner.ts
@@ -121,9 +121,31 @@ const STUB_RATES: Readonly<Record<string, {harness: number; raw: number;}>> = {
 // Pure functions (unit-tested)
 // ─────────────────────────────────────────────────────────────────────────
 
+/** Type guard: `value` is a non-null object whose `id`, `taskDescription` and `expectedBehavior` are all strings. */
+function isFixtureTask(value: unknown): value is FixtureTask {
+  if (value === null || typeof value !== 'object') {
+    return false
+  }
+
+  const candidate = value as Record<string, unknown>
+  return ['id', 'taskDescription', 'expectedBehavior'].every((key) => typeof candidate[key] === 'string')
+}
+
+const VALID_COMMAND_TYPES = new Set<string>(['chat', 'curate', 'query']) // NOTE(review): presumably mirrors the Fixture['commandType'] union — confirm they stay in sync
+
+function isValidCommandType(v: unknown): v is Fixture['commandType'] { // type guard; loadFixture substitutes 'curate' when it returns false
+  return typeof v === 'string' && VALID_COMMAND_TYPES.has(v) // non-strings never match
+}
+
 /** Parse and validate a fixture file. Throws on schema mismatch. */
 export function loadFixture(path: string): Fixture {
-  const raw = JSON.parse(readFileSync(path, 'utf8')) as unknown
+  let raw: unknown
+  try {
+    raw = JSON.parse(readFileSync(path, 'utf8')) as unknown
+  } catch (error) { // JSON.parse fails with SyntaxError; any other error came from readFileSync (missing/unreadable file)
+    throw new Error(`${error instanceof SyntaxError ? 'fixture file is not valid JSON' : 'fixture file could not be read'}: ${path} — ${error instanceof Error ? error.message : String(error)}`)
+  }
+
   if (typeof raw !== 'object' || raw === null) {
     throw new TypeError(`fixture is not an object: ${path}`)
   }
@@ -134,23 +156,20 @@ export function loadFixture(path: string): Fixture {
     throw new Error(`fixture has no tasks: ${path}`)
   }
 
+  const validatedTasks: FixtureTask[] = []
   for (const task of tasks) {
-    if (
-      typeof task !== 'object' ||
-      task === null ||
-      typeof (task as FixtureTask).id !== 'string' ||
-      typeof (task as FixtureTask).taskDescription !== 'string' ||
-      typeof (task as FixtureTask).expectedBehavior !== 'string'
-    ) {
+    if (!isFixtureTask(task)) {
       throw new Error(`fixture task malformed: ${JSON.stringify(task)}`)
     }
+
+    validatedTasks.push(task)
   }
 
   return {
-    commandType: (parsed.commandType as Fixture['commandType']) ?? 'curate',
-    fixtureVersion: (parsed.fixtureVersion as string) ?? 'unversioned',
-    targetModel: (parsed.targetModel as string) ?? 'unknown',
-    tasks: tasks as FixtureTask[],
+    commandType: isValidCommandType(parsed.commandType) ? parsed.commandType : 'curate',
+    fixtureVersion: typeof parsed.fixtureVersion === 'string' ? parsed.fixtureVersion : 'unversioned',
+    targetModel: typeof parsed.targetModel === 'string' ? parsed.targetModel : 'unknown',
+    tasks: validatedTasks,
   }
 }
 
@@ -219,11 +238,18 @@ export function computeKpiReport(args: {
   const rawSuccessRate = rawArm.overallSuccessRate
 
   const byTaskId = new Map(rawArm.perTask.map((t) => [t.taskId, t.successRate]))
-  const perTask = harnessArm.perTask.map((h) => ({
-    harnessSuccessRate: h.successRate,
-    rawSuccessRate: byTaskId.get(h.taskId) ?? 0,
-    taskId: h.taskId,
-  }))
+  const perTask = harnessArm.perTask.map((h) => {
+    const rawRate = byTaskId.get(h.taskId)
+    if (rawRate === undefined) {
+      throw new Error(`task ID mismatch: harness arm has '${h.taskId}' but raw arm does not`)
+    }
+
+    return {
+      harnessSuccessRate: h.successRate,
+      rawSuccessRate: rawRate,
+      taskId: h.taskId,
+    }
+  })
 
   return {
     delta: harnessSuccessRate - rawSuccessRate,
diff --git a/test/unit/scripts/autoharness-kpi/runner.test.ts b/test/unit/scripts/autoharness-kpi/runner.test.ts
index da5ac465c..4b35ee475 100644
--- a/test/unit/scripts/autoharness-kpi/runner.test.ts
+++ b/test/unit/scripts/autoharness-kpi/runner.test.ts
@@ -18,6 +18,7 @@ import {
   main,
   makeStubLlmClient,
   parseArgs,
+  renderReport,
   runArm,
   SHIP_GATE_DELTA,
 } from '../../../../scripts/autoharness-kpi/runner.js'
@@ -82,6 +83,17 @@ describe('KPI harness', () => {
       }
     })
 
+    it('throws a clear error on invalid JSON', () => {
+      const dir = mkdtempSync(join(tmpdir(), 'kpi-invalid-json-'))
+      try {
+        const bad = join(dir, 'not-json.json')
+        writeFileSync(bad, '{not: valid json}')
+        expect(() => loadFixture(bad)).to.throw(/not valid JSON/)
+      } finally {
+        rmSync(dir, {force: true, recursive: true})
+      }
+    })
+
     it('throws when tasks array is empty', () => {
       const dir = mkdtempSync(join(tmpdir(), 'kpi-empty-'))
       try {
@@ -167,6 +179,16 @@ describe('KPI harness', () => {
       expect(report.delta).to.be.closeTo(0.4, 1e-9)
     })
 
+    it('throws on task-ID mismatch between arms', () => {
+      const fixture = makeFixture([
+        {expectedBehavior: 'x', id: 'a', taskDescription: 'x'},
+      ])
+      const rawArm = makeArmResult('raw', ['b'], 0.5, 10) // 'b' not 'a'
+      const harnessArm = makeArmResult('harness', ['a'], 0.8, 10)
+      expect(() => computeKpiReport({fixture, harnessArm, rawArm, runsPerArm: 10}))
+        .to.throw(/task ID mismatch/)
+    })
+
     it('carries fixture + model metadata through to the report', () => {
       const fixture = makeFixture([
         {expectedBehavior: 'x', id: 'a', taskDescription: 'x'},
@@ -264,6 +286,65 @@ describe('KPI harness', () => {
     it('rejects --runs non-numeric', () => {
       expect(() => parseArgs(['--runs', 'abc'])).to.throw(/positive integer/)
     })
+
+    it('rejects --runs negative', () => {
+      expect(() => parseArgs(['--runs', '-1'])).to.throw(/positive integer/)
+    })
+
+    it('rejects --runs zero', () => {
+      expect(() => parseArgs(['--runs', '0'])).to.throw(/positive integer/)
+    })
+
+    it('rejects --fixture with no argument', () => {
+      expect(() => parseArgs(['--fixture'])).to.throw(/requires a path/)
+    })
+
+    it('rejects --output with no argument', () => {
+      expect(() => parseArgs(['--output'])).to.throw(/requires a path/)
+    })
+
+    it('rejects --runs with no argument', () => {
+      expect(() => parseArgs(['--runs'])).to.throw(/requires a number/)
+    })
+  })
+
+  describe('renderReport', () => {
+    it('includes per-task rates, overall, delta, and ship-gate verdict', () => {
+      const report: KpiReport = {
+        delta: 0.4,
+        fixtureVersion: 'test-1',
+        harnessSuccessRate: 0.9,
+        measuredAt: 0,
+        perTask: [
+          {harnessSuccessRate: 0.9, rawSuccessRate: 0.5, taskId: 't01-test'},
+        ],
+        rawSuccessRate: 0.5,
+        runsPerArm: 10,
+        targetModel: 'test-model',
+      }
+      const output = renderReport(report)
+      expect(output).to.include('t01-test')
+      expect(output).to.include('50%')
+      expect(output).to.include('90%')
+      expect(output).to.include('+40.0pp')
+      expect(output).to.include('ship gate met')
+    })
+
+    it('shows shortfall when delta is below ship gate', () => {
+      const report: KpiReport = {
+        delta: 0.2,
+        fixtureVersion: 'x',
+        harnessSuccessRate: 0.6,
+        measuredAt: 0,
+        perTask: [],
+        rawSuccessRate: 0.4,
+        runsPerArm: 10,
+        targetModel: 'x',
+      }
+      const output = renderReport(report)
+      expect(output).to.include('NOT met')
+      expect(output).to.include('10.0pp')
+    })
   })
 
   describe('main (end-to-end with stub LLM)', () => {
@@ -307,8 +388,9 @@ describe('KPI harness', () => {
         caught = error
       }
 
-      expect(caught).to.be.an('error')
-      expect((caught as Error).message).to.match(/--llm real is not yet implemented/)
+      expect(caught).to.be.instanceOf(Error)
+      if (!(caught instanceof Error)) throw new Error('unreachable')
+      expect(caught.message).to.match(/--llm real is not yet implemented/)
     })
   })
 })