From 567e6e0d3bc6e8b971dcbb5aa9918923daa8cbca Mon Sep 17 00:00:00 2001
From: Brad Cunningham
Date: Wed, 3 Jun 2026 10:18:24 -0400
Subject: [PATCH] fix(ci): calibrate gate must fail when it measures nothing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The calibrate job passed GREEN whenever all bench apps failed to boot:
aggregate-calibration.mjs emitted precision=1.0/recall=1.0 on an empty
aggregate and exit(0), and the PR comment rendered ✅. In practice every
bench app has been failing (health-check timeout + SurfaceMCP never
started in CI), so the gate has certified success while measuring zero
data — pure false assurance, the same pattern as the read-only "100%".

- aggregate: empty result now flags `calibrationRan: false` with null
  precision/recall (never 1.0), still exit 0 so the artifact + comment
  emit.
- comment: renders "❌ Calibration DID NOT RUN" instead of green ✅.
- calibrate.yml: new "Enforce calibration actually ran" step fails the
  job (red) when calibrationRan is false OR any wired kind violates
  threshold.

This makes the calibrate check RED until the bench infra is resurrected
in CI (boot the 5 app servers + start SurfaceMCP/camofox). main is not
branch-protected, so this is informational, not merge-blocking. A gate
that lies is worse than no gate.

Verified locally: empty -> calibrationRan:false + enforce exits 1; valid
report -> calibrationRan:true; comment renders the failure.

Co-Authored-By: Claude Opus 4.8 --- .github/workflows/calibrate.yml | 20 +++++++++++++++++ scripts/aggregate-calibration.mjs | 16 +++++++++----- scripts/post-calibration-comment.mjs | 33 ++++++++++++++++++---------- 3 files changed, 52 insertions(+), 17 deletions(-) diff --git a/.github/workflows/calibrate.yml b/.github/workflows/calibrate.yml index 43b3949a..01a82e46 100644 --- a/.github/workflows/calibrate.yml +++ b/.github/workflows/calibrate.yml @@ -124,6 +124,26 @@ jobs: const fn = await import(`${process.env.GITHUB_WORKSPACE}/scripts/post-calibration-comment.mjs`); await fn.default(github, context, '/tmp/aggregate.json'); + # The gate must go RED when calibration measured nothing (0 apps booted) or + # any wired kind falls below threshold. Previously the job passed green on an + # empty aggregate (precision=1/recall=1) — a false-assurance bug. Runs after + # the comment so the PR still shows why it failed. + - name: Enforce calibration actually ran + run: | + node -e " + const a = require('/tmp/aggregate.json'); + if (a.calibrationRan === false || (a.appsIncluded || []).length === 0) { + console.error('::error::Calibration produced NO data — 0 bench apps came up (health-check timeout / SurfaceMCP not started). 
The gate must not pass green on zero data.'); + process.exit(1); + } + const v = a.thresholdViolations || []; + if (v.length > 0) { + console.error('::error::Calibration threshold violations: ' + v.join(', ')); + process.exit(1); + } + console.log('Calibration ran: ' + (a.appsIncluded || []).join(', ') + ' (tp=' + a.overall.tp + ' fp=' + a.overall.fp + ' fn=' + a.overall.fn + ')'); + " + - name: Update README calibration block (push to main only) if: github.event_name == 'push' && github.ref == 'refs/heads/main' run: | diff --git a/scripts/aggregate-calibration.mjs b/scripts/aggregate-calibration.mjs index 0707f819..6cdf0684 100644 --- a/scripts/aggregate-calibration.mjs +++ b/scripts/aggregate-calibration.mjs @@ -38,18 +38,21 @@ for (const p of reportPaths) { } } if (reports.length === 0) { - // Every bench app failed (likely health-check timeout or threshold violation - // upstream in BugHunter-bench). Emit a vacuous aggregate so the workflow can - // continue and upload the artifact showing which apps failed; exit cleanly so - // CI doesn't go red on bench-app flake. - process.stderr.write('All bench apps failed; emitting empty aggregate.\n'); + // Every bench app failed to produce a report (health-check timeout, missing + // SurfaceMCP, etc.). This is NOT a pass — the gate measured nothing. Flag + // `calibrationRan: false` with null precision/recall (never 1.0 — that false + // green is the bug this replaces) so the comment renders a failure and the + // calibrate.yml "Enforce calibration ran" step turns the check red. We still + // exit 0 so the artifact uploads and the PR comment posts. 
+ process.stderr.write('All bench apps failed to produce a report — calibration did NOT run.\n'); process.stdout.write(JSON.stringify({ version: 1, schemaVersion: 'v0.44.0', generatedAt: new Date().toISOString(), + calibrationRan: false, appsIncluded: [], appsFailed, - overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: 1.0, recall: 1.0, f1: 0 }, + overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: null, recall: null, f1: null }, perKind: {}, thresholdViolations: [], }, null, 2) + '\n'); @@ -124,6 +127,7 @@ const aggregate = { version: 1, schemaVersion: 'v0.44.0', generatedAt: new Date().toISOString(), + calibrationRan: true, bughunterVersion: reports[0]?.bughunterVersion ?? 'unknown', bughunterCommit: reports[0]?.bughunterCommit ?? 'unknown', appsIncluded: reports.map(r => r.benchAppId), diff --git a/scripts/post-calibration-comment.mjs b/scripts/post-calibration-comment.mjs index ab5c6e95..a07ae946 100644 --- a/scripts/post-calibration-comment.mjs +++ b/scripts/post-calibration-comment.mjs @@ -13,6 +13,21 @@ export default async function postCalibrationComment(github, context, aggregateP const agg = JSON.parse(fs.readFileSync(aggregatePath, 'utf-8')); const { overall, perKind, thresholdViolations, appsIncluded, generatedAt } = agg; + // Calibration did not run — no bench app produced a report. Render a loud + // failure instead of a green ✅ with vacuous precision=1/recall=1. + if (agg.calibrationRan === false || (appsIncluded?.length ?? 0) === 0) { + const failed = (agg.appsFailed ?? []).map(f => `\`${f.path?.split('/').slice(-2, -1)[0] ?? f.path}\` (${f.reason})`).join(', '); + const failBody = + `❌ **BugHunter Calibration DID NOT RUN** | ${generatedAt?.slice(0, 10)}\n\n` + + `**0 bench apps produced a report** — the calibration gate measured nothing ` + + `(no precision/recall data). This is a failure, not a pass.\n\n` + + (failed ? 
`Failed: ${failed}\n\n` : '') + + `Likely cause: bench apps did not boot (health-check timeout) or SurfaceMCP ` + + `was not started. See the calibrate job log.\n`; + await upsertComment(github, context, failBody); + return; + } + const statusIcon = thresholdViolations.length === 0 ? '✅' : '❌'; const headerLine = `${statusIcon} **BugHunter Calibration** | ${appsIncluded?.join(', ')} | ${generatedAt?.slice(0, 10)}`; @@ -41,23 +56,19 @@ export default async function postCalibrationComment(github, context, aggregateP const body = `${headerLine}\n\n${overallLine}\n\n${table}${violationsSection}\n`; + await upsertComment(github, context, body); +} + +// Create or update the single "BugHunter Calibration" PR comment. +async function upsertComment(github, context, body) { const { data: comments } = await github.rest.issues.listComments({ ...context.repo, issue_number: context.issue.number, }); - const existing = comments.find(c => c.body?.includes('BugHunter Calibration')); if (existing) { - await github.rest.issues.updateComment({ - ...context.repo, - comment_id: existing.id, - body, - }); + await github.rest.issues.updateComment({ ...context.repo, comment_id: existing.id, body }); } else { - await github.rest.issues.createComment({ - ...context.repo, - issue_number: context.issue.number, - body, - }); + await github.rest.issues.createComment({ ...context.repo, issue_number: context.issue.number, body }); } }