diff --git a/.github/workflows/calibrate.yml b/.github/workflows/calibrate.yml index 43b3949..01a82e4 100644 --- a/.github/workflows/calibrate.yml +++ b/.github/workflows/calibrate.yml @@ -124,6 +124,26 @@ jobs: const fn = await import(`${process.env.GITHUB_WORKSPACE}/scripts/post-calibration-comment.mjs`); await fn.default(github, context, '/tmp/aggregate.json'); + # The gate must go RED when calibration measured nothing (0 apps booted) or + # any wired kind falls below threshold. Previously the job passed green on an + # empty aggregate (precision=1/recall=1) — a false-assurance bug. Runs after + # the comment so the PR still shows why it failed. + - name: Enforce calibration actually ran + run: | + node -e " + const a = require('/tmp/aggregate.json'); + if (a.calibrationRan === false || (a.appsIncluded || []).length === 0) { + console.error('::error::Calibration produced NO data — 0 bench apps came up (health-check timeout / SurfaceMCP not started). The gate must not pass green on zero data.'); + process.exit(1); + } + const v = a.thresholdViolations || []; + if (v.length > 0) { + console.error('::error::Calibration threshold violations: ' + v.join(', ')); + process.exit(1); + } + console.log('Calibration ran: ' + (a.appsIncluded || []).join(', ') + ' (tp=' + a.overall.tp + ' fp=' + a.overall.fp + ' fn=' + a.overall.fn + ')'); + " + - name: Update README calibration block (push to main only) if: github.event_name == 'push' && github.ref == 'refs/heads/main' run: | diff --git a/scripts/aggregate-calibration.mjs b/scripts/aggregate-calibration.mjs index 0707f81..6cdf068 100644 --- a/scripts/aggregate-calibration.mjs +++ b/scripts/aggregate-calibration.mjs @@ -38,18 +38,21 @@ for (const p of reportPaths) { } } if (reports.length === 0) { - // Every bench app failed (likely health-check timeout or threshold violation - // upstream in BugHunter-bench). 
Emit a vacuous aggregate so the workflow can - // continue and upload the artifact showing which apps failed; exit cleanly so - // CI doesn't go red on bench-app flake. - process.stderr.write('All bench apps failed; emitting empty aggregate.\n'); + // Every bench app failed to produce a report (health-check timeout, missing + // SurfaceMCP, etc.). This is NOT a pass — the gate measured nothing. Flag + // `calibrationRan: false` with null precision/recall (never 1.0 — that false + // green is the bug this replaces) so the comment renders a failure and the + // calibrate.yml "Enforce calibration actually ran" step turns the check red. We still + // exit 0 so the artifact uploads and the PR comment posts. + process.stderr.write('All bench apps failed to produce a report — calibration did NOT run.\n'); process.stdout.write(JSON.stringify({ version: 1, schemaVersion: 'v0.44.0', generatedAt: new Date().toISOString(), + calibrationRan: false, appsIncluded: [], appsFailed, - overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: 1.0, recall: 1.0, f1: 0 }, + overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: null, recall: null, f1: null }, perKind: {}, thresholdViolations: [], }, null, 2) + '\n'); @@ -124,6 +127,7 @@ const aggregate = { version: 1, schemaVersion: 'v0.44.0', generatedAt: new Date().toISOString(), + calibrationRan: true, bughunterVersion: reports[0]?.bughunterVersion ?? 'unknown', bughunterCommit: reports[0]?.bughunterCommit ??
'unknown', appsIncluded: reports.map(r => r.benchAppId), diff --git a/scripts/post-calibration-comment.mjs b/scripts/post-calibration-comment.mjs index ab5c6e9..a07ae94 100644 --- a/scripts/post-calibration-comment.mjs +++ b/scripts/post-calibration-comment.mjs @@ -13,6 +13,21 @@ export default async function postCalibrationComment(github, context, aggregateP const agg = JSON.parse(fs.readFileSync(aggregatePath, 'utf-8')); const { overall, perKind, thresholdViolations, appsIncluded, generatedAt } = agg; + // Calibration did not run — no bench app produced a report. Render a loud + // failure instead of a green ✅ with vacuous precision=1/recall=1. + if (agg.calibrationRan === false || (appsIncluded?.length ?? 0) === 0) { + const failed = (agg.appsFailed ?? []).map(f => `\`${f.path?.split('/').slice(-2, -1)[0] ?? f.path}\` (${f.reason})`).join(', '); + const failBody = + `❌ **BugHunter Calibration DID NOT RUN** | ${generatedAt?.slice(0, 10)}\n\n` + + `**0 bench apps produced a report** — the calibration gate measured nothing ` + + `(no precision/recall data). This is a failure, not a pass.\n\n` + + (failed ? `Failed: ${failed}\n\n` : '') + + `Likely cause: bench apps did not boot (health-check timeout) or SurfaceMCP ` + + `was not started. See the calibrate job log.\n`; + await upsertComment(github, context, failBody); + return; + } + const statusIcon = thresholdViolations.length === 0 ? '✅' : '❌'; const headerLine = `${statusIcon} **BugHunter Calibration** | ${appsIncluded?.join(', ')} | ${generatedAt?.slice(0, 10)}`; @@ -41,23 +56,19 @@ export default async function postCalibrationComment(github, context, aggregateP const body = `${headerLine}\n\n${overallLine}\n\n${table}${violationsSection}\n`; + await upsertComment(github, context, body); +} + +// Create or update the single "BugHunter Calibration" PR comment. 
/**
 * Create or update the single "BugHunter Calibration" comment on the PR.
 *
 * Searches every comment on the issue/PR for one whose body contains the
 * marker text `BugHunter Calibration`. If one exists it is updated in place;
 * otherwise a new comment is created.
 *
 * @param {object} github - Octokit-style client exposing
 *   `rest.issues.listComments`, `rest.issues.updateComment` and
 *   `rest.issues.createComment`, and optionally `paginate`.
 * @param {object} context - Provides `repo` (spread into every request) and
 *   `issue.number`.
 * @param {string} body - Markdown body to write to the comment.
 * @returns {Promise<void>}
 */
async function upsertComment(github, context, body) {
  const listParams = {
    ...context.repo,
    issue_number: context.issue.number,
    per_page: 100,
  };

  // A single listComments call returns only the first page (30 comments by
  // default, 100 at most). On a busy PR the marker comment can sit on a later
  // page, which previously caused a duplicate comment on every run. Walk all
  // pages when the client offers a paginator; otherwise fall back to the one
  // (now larger) page.
  const comments = typeof github.paginate === 'function'
    ? await github.paginate(github.rest.issues.listComments, listParams)
    : (await github.rest.issues.listComments(listParams)).data;

  const existing = comments.find((comment) => comment.body?.includes('BugHunter Calibration'));

  if (existing) {
    await github.rest.issues.updateComment({ ...context.repo, comment_id: existing.id, body });
  } else {
    await github.rest.issues.createComment({ ...context.repo, issue_number: context.issue.number, body });
  }
}