From 567e6e0d3bc6e8b971dcbb5aa9918923daa8cbca Mon Sep 17 00:00:00 2001
From: Brad Cunningham <cunningham.be@gmail.com>
Date: Wed, 3 Jun 2026 10:18:24 -0400
Subject: [PATCH] fix(ci): calibrate gate must fail when it measures nothing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The calibrate job passed GREEN whenever all bench apps failed to boot:
aggregate-calibration.mjs emitted precision=1.0/recall=1.0 on an empty
aggregate and exited 0, and the PR comment rendered ✅. In practice every
bench app has been failing (health-check timeout + SurfaceMCP never
started in CI), so the gate has certified success while measuring zero
data — pure false assurance, the same pattern as the read-only "100%".

- aggregate: empty result now flags `calibrationRan: false` with null
  precision/recall (never 1.0), still exit 0 so the artifact + comment
  emit.
- comment: renders "❌ BugHunter Calibration DID NOT RUN" instead of a
  green ✅.
- calibrate.yml: new "Enforce calibration actually ran" step fails the
  job (red) when calibrationRan is false OR any wired kind violates
  threshold.

This makes the calibrate check RED until the bench infra is resurrected
in CI (boot the 5 app servers + start SurfaceMCP/camofox). main is not
branch-protected, so this is informational, not merge-blocking. A gate
that lies is worse than no gate.

Verified locally: empty -> calibrationRan:false + enforce exits 1;
valid report -> calibrationRan:true; comment renders the failure.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/calibrate.yml      | 20 +++++++++++++++++
 scripts/aggregate-calibration.mjs    | 16 +++++++++-----
 scripts/post-calibration-comment.mjs | 33 ++++++++++++++++++----------
 3 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/calibrate.yml b/.github/workflows/calibrate.yml
index 43b3949a..01a82e46 100644
--- a/.github/workflows/calibrate.yml
+++ b/.github/workflows/calibrate.yml
@@ -124,6 +124,26 @@ jobs:
             const fn = await import(`${process.env.GITHUB_WORKSPACE}/scripts/post-calibration-comment.mjs`);
             await fn.default(github, context, '/tmp/aggregate.json');
 
+      # The gate must go RED when calibration measured nothing (0 apps booted) or
+      # any wired kind falls below threshold. Previously the job passed green on an
+      # empty aggregate (precision=1/recall=1) — a false-assurance bug. Runs after
+      # the comment so the PR still shows why it failed.
+      - name: Enforce calibration actually ran
+        run: |
+          node -e "
+            const a = require('/tmp/aggregate.json');
+            if (a.calibrationRan === false || (a.appsIncluded || []).length === 0) {
+              console.error('::error::Calibration produced NO data — 0 bench apps came up (health-check timeout / SurfaceMCP not started). The gate must not pass green on zero data.');
+              process.exit(1);
+            }
+            const v = a.thresholdViolations || [];
+            if (v.length > 0) {
+              console.error('::error::Calibration threshold violations: ' + v.join(', '));
+              process.exit(1);
+            }
+            console.log('Calibration ran: ' + (a.appsIncluded || []).join(', ') + ' (tp=' + a.overall.tp + ' fp=' + a.overall.fp + ' fn=' + a.overall.fn + ')');
+          "
+
       - name: Update README calibration block (push to main only)
         if: github.event_name == 'push' && github.ref == 'refs/heads/main'
         run: |
diff --git a/scripts/aggregate-calibration.mjs b/scripts/aggregate-calibration.mjs
index 0707f819..6cdf0684 100644
--- a/scripts/aggregate-calibration.mjs
+++ b/scripts/aggregate-calibration.mjs
@@ -38,18 +38,21 @@ for (const p of reportPaths) {
   }
 }
 if (reports.length === 0) {
-  // Every bench app failed (likely health-check timeout or threshold violation
-  // upstream in BugHunter-bench). Emit a vacuous aggregate so the workflow can
-  // continue and upload the artifact showing which apps failed; exit cleanly so
-  // CI doesn't go red on bench-app flake.
-  process.stderr.write('All bench apps failed; emitting empty aggregate.\n');
+  // Every bench app failed to produce a report (health-check timeout, missing
+  // SurfaceMCP, etc.). This is NOT a pass — the gate measured nothing. Flag
+  // `calibrationRan: false` with null precision/recall (never 1.0 — that false
+  // green is the bug this replaces) so the comment renders a failure and the
+  // calibrate.yml "Enforce calibration actually ran" step turns the check red. We still
+  // exit 0 so the artifact uploads and the PR comment posts.
+  process.stderr.write('All bench apps failed to produce a report — calibration did NOT run.\n');
   process.stdout.write(JSON.stringify({
     version: 1,
     schemaVersion: 'v0.44.0',
     generatedAt: new Date().toISOString(),
+    calibrationRan: false,
     appsIncluded: [],
     appsFailed,
-    overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: 1.0, recall: 1.0, f1: 0 },
+    overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: null, recall: null, f1: null },
     perKind: {},
     thresholdViolations: [],
   }, null, 2) + '\n');
@@ -124,6 +127,7 @@ const aggregate = {
   version: 1,
   schemaVersion: 'v0.44.0',
   generatedAt: new Date().toISOString(),
+  calibrationRan: true,
   bughunterVersion: reports[0]?.bughunterVersion ?? 'unknown',
   bughunterCommit: reports[0]?.bughunterCommit ?? 'unknown',
   appsIncluded: reports.map(r => r.benchAppId),
diff --git a/scripts/post-calibration-comment.mjs b/scripts/post-calibration-comment.mjs
index ab5c6e95..a07ae946 100644
--- a/scripts/post-calibration-comment.mjs
+++ b/scripts/post-calibration-comment.mjs
@@ -13,6 +13,21 @@ export default async function postCalibrationComment(github, context, aggregateP
   const agg = JSON.parse(fs.readFileSync(aggregatePath, 'utf-8'));
   const { overall, perKind, thresholdViolations, appsIncluded, generatedAt } = agg;
 
+  // Calibration did not run — no bench app produced a report. Render a loud
+  // failure instead of a green ✅ with vacuous precision=1/recall=1.
+  if (agg.calibrationRan === false || (appsIncluded?.length ?? 0) === 0) {
+    const failed = (agg.appsFailed ?? []).map(f => `\`${f.path?.split('/').slice(-2, -1)[0] ?? f.path}\` (${f.reason})`).join(', ');
+    const failBody =
+      `❌ **BugHunter Calibration DID NOT RUN** | ${generatedAt?.slice(0, 10)}\n\n` +
+      `**0 bench apps produced a report** — the calibration gate measured nothing ` +
+      `(no precision/recall data). This is a failure, not a pass.\n\n` +
+      (failed ? `Failed: ${failed}\n\n` : '') +
+      `Likely cause: bench apps did not boot (health-check timeout) or SurfaceMCP ` +
+      `was not started. See the calibrate job log.\n`;
+    await upsertComment(github, context, failBody);
+    return;
+  }
+
   const statusIcon = thresholdViolations.length === 0 ? '✅' : '❌';
   const headerLine = `${statusIcon} **BugHunter Calibration** | ${appsIncluded?.join(', ')} | ${generatedAt?.slice(0, 10)}`;
 
@@ -41,23 +56,19 @@ export default async function postCalibrationComment(github, context, aggregateP
   const body =
     `${headerLine}\n\n${overallLine}\n\n${table}${violationsSection}\n`;
 
+  await upsertComment(github, context, body);
+}
+
+// Create or update the single "BugHunter Calibration" PR comment.
+async function upsertComment(github, context, body) {
   const { data: comments } = await github.rest.issues.listComments({
     ...context.repo,
     issue_number: context.issue.number,
   });
-
   const existing = comments.find(c => c.body?.includes('BugHunter Calibration'));
   if (existing) {
-    await github.rest.issues.updateComment({
-      ...context.repo,
-      comment_id: existing.id,
-      body,
-    });
+    await github.rest.issues.updateComment({ ...context.repo, comment_id: existing.id, body });
   } else {
-    await github.rest.issues.createComment({
-      ...context.repo,
-      issue_number: context.issue.number,
-      body,
-    });
+    await github.rest.issues.createComment({ ...context.repo, issue_number: context.issue.number, body });
   }
 }