Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .github/workflows/calibrate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,26 @@ jobs:
const fn = await import(`${process.env.GITHUB_WORKSPACE}/scripts/post-calibration-comment.mjs`);
await fn.default(github, context, '/tmp/aggregate.json');

# The gate must go RED when calibration measured nothing (0 apps booted) or
# any wired kind falls below threshold. Previously the job passed green on an
# empty aggregate (precision=1/recall=1) — a false-assurance bug. Runs after
# the comment so the PR still shows why it failed.
- name: Enforce calibration actually ran
run: |
node -e "
const a = require('/tmp/aggregate.json');
if (a.calibrationRan === false || (a.appsIncluded || []).length === 0) {
console.error('::error::Calibration produced NO data — 0 bench apps came up (health-check timeout / SurfaceMCP not started). The gate must not pass green on zero data.');
process.exit(1);
}
const v = a.thresholdViolations || [];
if (v.length > 0) {
console.error('::error::Calibration threshold violations: ' + v.join(', '));
process.exit(1);
}
console.log('Calibration ran: ' + (a.appsIncluded || []).join(', ') + ' (tp=' + a.overall.tp + ' fp=' + a.overall.fp + ' fn=' + a.overall.fn + ')');
"

- name: Update README calibration block (push to main only)
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
run: |
Expand Down
16 changes: 10 additions & 6 deletions scripts/aggregate-calibration.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,21 @@ for (const p of reportPaths) {
}
}
if (reports.length === 0) {
// Every bench app failed (likely health-check timeout or threshold violation
// upstream in BugHunter-bench). Emit a vacuous aggregate so the workflow can
// continue and upload the artifact showing which apps failed; exit cleanly so
// CI doesn't go red on bench-app flake.
process.stderr.write('All bench apps failed; emitting empty aggregate.\n');
// Every bench app failed to produce a report (health-check timeout, missing
// SurfaceMCP, etc.). This is NOT a pass — the gate measured nothing. Flag
// `calibrationRan: false` with null precision/recall (never 1.0 — that false
// green is the bug this replaces) so the comment renders a failure and the
// calibrate.yml "Enforce calibration actually ran" step turns the check red. We still
// exit 0 so the artifact uploads and the PR comment posts.
process.stderr.write('All bench apps failed to produce a report — calibration did NOT run.\n');
process.stdout.write(JSON.stringify({
version: 1,
schemaVersion: 'v0.44.0',
generatedAt: new Date().toISOString(),
calibrationRan: false,
appsIncluded: [],
appsFailed,
overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: 1.0, recall: 1.0, f1: 0 },
overall: { totalClusters: 0, totalGoldEntries: 0, tp: 0, fp: 0, fn: 0, tn: 0, precision: null, recall: null, f1: null },
perKind: {},
thresholdViolations: [],
}, null, 2) + '\n');
Expand Down Expand Up @@ -124,6 +127,7 @@ const aggregate = {
version: 1,
schemaVersion: 'v0.44.0',
generatedAt: new Date().toISOString(),
calibrationRan: true,
bughunterVersion: reports[0]?.bughunterVersion ?? 'unknown',
bughunterCommit: reports[0]?.bughunterCommit ?? 'unknown',
appsIncluded: reports.map(r => r.benchAppId),
Expand Down
33 changes: 22 additions & 11 deletions scripts/post-calibration-comment.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,21 @@ export default async function postCalibrationComment(github, context, aggregateP
const agg = JSON.parse(fs.readFileSync(aggregatePath, 'utf-8'));
const { overall, perKind, thresholdViolations, appsIncluded, generatedAt } = agg;

// Calibration did not run — no bench app produced a report. Render a loud
// failure instead of a green ✅ with vacuous precision=1/recall=1.
if (agg.calibrationRan === false || (appsIncluded?.length ?? 0) === 0) {
const failed = (agg.appsFailed ?? []).map(f => `\`${f.path?.split('/').slice(-2, -1)[0] ?? f.path}\` (${f.reason})`).join(', ');
const failBody =
`❌ **BugHunter Calibration DID NOT RUN** | ${generatedAt?.slice(0, 10)}\n\n` +
`**0 bench apps produced a report** — the calibration gate measured nothing ` +
`(no precision/recall data). This is a failure, not a pass.\n\n` +
(failed ? `Failed: ${failed}\n\n` : '') +
`Likely cause: bench apps did not boot (health-check timeout) or SurfaceMCP ` +
`was not started. See the calibrate job log.\n`;
await upsertComment(github, context, failBody);
return;
}

const statusIcon = thresholdViolations.length === 0 ? '✅' : '❌';
const headerLine = `${statusIcon} **BugHunter Calibration** | ${appsIncluded?.join(', ')} | ${generatedAt?.slice(0, 10)}`;

Expand Down Expand Up @@ -41,23 +56,19 @@ export default async function postCalibrationComment(github, context, aggregateP
const body =
`${headerLine}\n\n${overallLine}\n\n${table}${violationsSection}\n`;

await upsertComment(github, context, body);
}

/**
 * Create or update the single "BugHunter Calibration" PR comment.
 *
 * Lists the comments on the current issue/PR and looks for one whose body
 * contains the marker text "BugHunter Calibration". If one is found it is
 * updated in place; otherwise a new comment is created. Exactly one write
 * request is issued per call.
 *
 * @param {object} github - Client exposing `rest.issues.listComments`,
 *   `rest.issues.updateComment` and `rest.issues.createComment`.
 * @param {object} context - Supplies `repo` (spread into every request) and
 *   `issue.number`.
 * @param {string} body - Full comment body to publish.
 * @returns {Promise<void>}
 */
async function upsertComment(github, context, body) {
  const { data: comments } = await github.rest.issues.listComments({
    ...context.repo,
    issue_number: context.issue.number,
    // Request the largest page so an existing calibration comment on a busy
    // PR is still found instead of a duplicate being posted.
    per_page: 100,
  });

  // Comments may have no body; optional chaining skips those safely.
  const existing = comments.find((c) => c.body?.includes('BugHunter Calibration'));

  if (existing) {
    await github.rest.issues.updateComment({ ...context.repo, comment_id: existing.id, body });
    return;
  }

  await github.rest.issues.createComment({ ...context.repo, issue_number: context.issue.number, body });
}
Loading