From 1e89ce3432f97057517a27bf1311cffb15827ddc Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sat, 27 Jun 2026 09:45:27 +0100 Subject: [PATCH 1/4] ci(benchmark): compare PR against a same-runner main build The PR "Benchmark Results" comment classified benchmarks by comparing the PR run against a cached `main` baseline measured on a different runner at a different time. Cross-runner variance dwarfs each run's own min/max spread, so the range-overlap classifier flipped large numbers of unrelated benchmarks to improved/regressed (e.g. #805, a regex-only change, reported non-regex benchmarks as improved). The verdicts were not actionable. Build the PR's base commit in a new `build-main` job and benchmark that `main` build and the PR build back-to-back on the same runner, after a warm-up discard of both binaries, so deltas reflect the diff rather than infrastructure. The range-overlap classifier is unchanged; it now compares two runs measured under the same conditions. The `main`-base run is `continue-on-error` so a PR that adds a benchmark `main` cannot run is not blocked (it degrades to new). The comparison stays advisory (comment-only). - Extract the ~200-line inline comparison into scripts/benchmark-compare.js with unit tests (scripts/test-benchmark-compare.ts), wired into both cli jobs; report the measured median per-run variance as a noise floor. - Regenerate the deterministic profile diff from the same `main` build and remove the dead cross-runner benchmark/profile baseline caches. - Add ADR 0076 and update docs/build-system.md and docs/benchmarks.md. Closes #815 Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ci.yml | 22 +- .github/workflows/pr.yml | 367 +++++++----------- .../0076-same-runner-benchmark-comparison.md | 13 + docs/adr/README.md | 1 + docs/benchmarks.md | 18 +- docs/build-system.md | 4 +- scripts/benchmark-compare.js | 274 +++++++++++++ scripts/profile-diff.js | 2 +- scripts/test-benchmark-compare.ts | 167 ++++++++ 9 files changed, 612 insertions(+), 256 deletions(-) create mode 100644 docs/adr/0076-same-runner-benchmark-comparison.md create mode 100644 scripts/benchmark-compare.js create mode 100644 scripts/test-benchmark-compare.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f4311fe5..a2db4044f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -779,7 +779,10 @@ jobs: GOCCIA_BENCH_ROUNDS: 7 run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress - - name: Run benchmarks and save baseline + # main/x86_64-linux emits JSON so the next step can validate the benchmark + # report shape. PR comparison no longer reads a cached baseline from here โ€” + # each PR builds and benchmarks `main` on its own runner (issue #815). + - name: Run benchmarks (validate JSON output) if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' env: GOCCIA_BENCH_CALIBRATION_MS: 100 @@ -818,13 +821,6 @@ jobs: sys.exit(1) PY - - name: Cache benchmark baseline - if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 - with: - path: benchmark-${{ matrix.mode }}-results.json - key: benchmark-${{ matrix.mode }}-baseline-${{ github.sha }} - - uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2 if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode' @@ -855,13 +851,6 @@ jobs: --summary-output benchmark-profile-aggregate.json \ --markdown-output benchmark-profile-aggregate.md - - name: Cache deterministic profile baseline - if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 - with: - path: profile-baseline/ - key: profile-baseline-${{ github.sha }} - - name: Upload benchmark profile if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode' && hashFiles('benchmark-profile-aggregate.json') != '' && hashFiles('profile-baseline/**') != '' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 @@ -949,6 +938,9 @@ jobs: - name: Check embedded resource CLI behaviour run: bun run scripts/test-cli-embedded-resources.ts + - name: Check benchmark comparison logic + run: bun run scripts/test-benchmark-compare.ts + - name: Check Windows TLS dependencies if: runner.os == 'Windows' shell: pwsh diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index bf3814423..74e3f79f7 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -39,6 +39,35 @@ jobs: retention-days: 1 path: pr-artifacts/* + # Builds the benchmark runner from the PR's base commit (the `main` the PR + # targets) so the `benchmark` job can measure it back-to-back with the PR build + # on the *same* runner. This is the trustworthy half of the same-runner A/B + # comparison that replaces the old cross-runner cached baseline (issue #815). + build-main: + runs-on: ubuntu-latest + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + ref: ${{ github.event.pull_request.base.sha }} + fetch-depth: 0 + persist-credentials: false + + - name: Install FPC + run: sudo apt-get update && sudo apt-get install fpc -qq > /dev/null + + - name: Build benchmark runner (main base) + run: ./build.pas benchmarkrunner --prod + + - name: Upload main benchmark runner + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + with: + name: build-main + retention-days: 1 + path: build/GocciaBenchmarkRunner + docs: runs-on: ubuntu-latest steps: @@ -164,7 +193,7 @@ jobs: fi benchmark: - needs: build + needs: [build, build-main] runs-on: ubuntu-latest strategy: fail-fast: false @@ -185,8 +214,42 @@ jobs: name: build-pr path: build/ + - name: Download main benchmark runner + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7 + with: + name: build-main + path: build-main/ + - name: Make binaries executable - run: chmod +x build/* + run: | + chmod +x build/* + chmod +x build-main/* + + # Same-runner A/B (issue #815): the PR and `main` builds are measured on + # this one runner so the delta reflects the diff, not cross-runner + # variance. Both benchmark the PR's `benchmarks/` workloads. The warm-up + # runs are discarded on purpose โ€” they shed runner cold-start (page + # faults, sub-turbo clocks) for BOTH binaries so neither the `main` run + # (measured first) nor the PR run is penalised by being measured cold. + - name: Warm up benchmark runners + env: + GOCCIA_BENCH_ROUNDS: 1 + run: | + ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true + ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true + + # `main` is built from the PR's base commit, so it cannot run benchmarks the + # PR newly adds or changes to use new language features. The report is still + # written before a non-zero exit, so continue-on-error keeps such a PR from + # being blocked: a benchmark that errors on `main` simply has no baseline + # entry and renders as ๐Ÿ†• new. The PR's own run below is intentionally NOT + # tolerant โ€” a broken PR benchmark should fail the leg. + - name: Run benchmarks (main base) + continue-on-error: true + env: + GOCCIA_BENCH_CALIBRATION_MS: 100 + GOCCIA_BENCH_ROUNDS: 7 + run: ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}-main.json - name: Run benchmarks env: @@ -194,6 +257,26 @@ jobs: GOCCIA_BENCH_ROUNDS: 7 run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json + - name: Capture deterministic profiles (main base) + if: matrix.mode == 'bytecode' + continue-on-error: true + run: | + set -euo pipefail + mkdir -p profile-main + while IFS= read -r bench; do + rel="${bench#benchmarks/}" + out="profile-main/${rel%.*}.json" + mkdir -p "$(dirname "$out")" + ./build-main/GocciaBenchmarkRunner "$bench" \ + --mode=bytecode \ + --profile-deterministic \ + --profile=all \ + --profile-output="$out" \ + --no-progress \ + --format=compact-json \ + > /dev/null + done < <(find benchmarks -type f \( -name '*.js' -o -name '*.jsx' -o -name '*.ts' -o -name '*.tsx' -o -name '*.mjs' \) ! -path '*/helpers/*' | sort) + - name: Capture deterministic profiles if: matrix.mode == 'bytecode' continue-on-error: true @@ -227,6 +310,13 @@ jobs: retention-days: 1 path: benchmark-${{ matrix.mode }}.json + - name: Upload main results + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + with: + name: benchmark-${{ matrix.mode }}-main + retention-days: 1 + path: benchmark-${{ matrix.mode }}-main.json + - name: Upload deterministic profiles if: matrix.mode == 'bytecode' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 @@ -235,6 +325,14 @@ jobs: retention-days: 1 path: profile-current/ + - name: Upload main deterministic profiles + if: matrix.mode == 'bytecode' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + with: + name: profile-main + retention-days: 1 + path: profile-main/ + cli: needs: build runs-on: ubuntu-latest @@ -276,6 +374,9 @@ jobs: - name: Check embedded resource CLI behaviour run: bun run scripts/test-cli-embedded-resources.ts + - name: Check benchmark comparison logic + run: bun run scripts/test-benchmark-compare.ts + test262: needs: build runs-on: ubuntu-latest @@ -381,259 +482,65 @@ jobs: name: profile-current path: profile-current/ - - name: Restore interpreted baseline - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 + - name: Download interpreted main results + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7 + continue-on-error: true with: - path: benchmark-interpreted-results.json - key: benchmark-interpreted-baseline-will-not-match - restore-keys: benchmark-interpreted-baseline- + name: benchmark-interpreted-main - - name: Restore bytecode baseline - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 + - name: Download bytecode main results + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7 + continue-on-error: true with: - path: benchmark-bytecode-results.json - key: benchmark-bytecode-baseline-will-not-match - restore-keys: benchmark-bytecode-baseline- + name: benchmark-bytecode-main - - name: Restore deterministic profile baseline - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 + - name: Download main deterministic profiles + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7 + continue-on-error: true with: - path: profile-baseline/ - key: profile-baseline-will-not-match - restore-keys: profile-baseline- - - - name: Check baselines - run: | - if [ -f benchmark-interpreted-results.json ]; then - echo "BASELINE_EXISTS=true" >> "$GITHUB_ENV" - else - echo "BASELINE_EXISTS=false" >> "$GITHUB_ENV" - fi - if [ -f benchmark-bytecode-results.json ]; then - echo "BYTECODE_BASELINE_EXISTS=true" >> "$GITHUB_ENV" - else - echo "BYTECODE_BASELINE_EXISTS=false" >> "$GITHUB_ENV" - fi + name: profile-main + path: profile-main/ - name: Compare benchmarks and comment uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | const fs = require('fs'); + const { buildBenchmarkComparison } = require('./scripts/benchmark-compare.js'); const { buildProfileDiffSection } = require('./scripts/profile-diff.js'); - const fmtOps = (n) => Math.round(n).toString().replace(/\B(?=(\d{3})+(?!\d))/g, ','); - const fmtPercent = (change) => { - if (change == null) return 'N/A'; - return `${change >= 0 ? '+' : ''}${change.toFixed(1)}%`; - }; - const fmtAvg = (changes) => { - if (changes.length === 0) return ''; - const avg = changes.reduce((a, b) => a + b, 0) / changes.length; - return `avg ${fmtPercent(avg)}`; - }; - const fmtDelta = (classification, change) => { - if (change == null) return classification === 'new' ? '๐Ÿ†•' : 'N/A'; - if (classification === 'improved') return `๐ŸŸข ${fmtPercent(change)}`; - if (classification === 'regressed') return `๐Ÿ”ด ${fmtPercent(change)}`; - if (classification === 'unchanged') return `~ overlap (${fmtPercent(change)})`; - return '๐Ÿ†•'; - }; - const fmtRange = (entry) => { - if (!entry) return 'โ€”'; - if (entry.minOpsPerSec === entry.maxOpsPerSec) return fmtOps(entry.opsPerSec); - return `${fmtOps(entry.minOpsPerSec)}..${fmtOps(entry.maxOpsPerSec)}`; + const read = (p) => JSON.parse(fs.readFileSync(p, 'utf8')); + const tryRead = (p) => { + try { return read(p); } + catch (e) { console.log(`No same-runner main result at ${p}:`, e.message); return null; } }; - const fmtBenchmark = (entry) => { - if (!entry) return 'โ€”'; - if (entry.minOpsPerSec === entry.maxOpsPerSec) return `${fmtOps(entry.opsPerSec)} ops/sec`; - return `${fmtOps(entry.opsPerSec)} ops/sec [${fmtRange(entry)}]`; - }; - const fmtCell = (baseEntry, currentEntry) => { - if (baseEntry) return `${fmtBenchmark(baseEntry)} โ†’ ${fmtBenchmark(currentEntry)}`; - return fmtBenchmark(currentEntry); - }; - const normalizeBenchmark = (benchmark) => ({ - opsPerSec: benchmark.opsPerSec, - minOpsPerSec: benchmark.minOpsPerSec ?? benchmark.opsPerSec, - maxOpsPerSec: benchmark.maxOpsPerSec ?? benchmark.opsPerSec, - variancePercentage: benchmark.variancePercentage ?? 0, - }); - const calculateChange = (baseEntry, currentEntry) => { - if (!baseEntry || !baseEntry.opsPerSec) return null; - return ((currentEntry.opsPerSec - baseEntry.opsPerSec) / baseEntry.opsPerSec) * 100; - }; - const classify = (baseEntry, currentEntry) => { - if (!baseEntry) return 'new'; - if (currentEntry.minOpsPerSec > baseEntry.maxOpsPerSec) return 'improved'; - if (currentEntry.maxOpsPerSec < baseEntry.minOpsPerSec) return 'regressed'; - return 'unchanged'; - }; - - const interpData = JSON.parse(fs.readFileSync('benchmark-interpreted.json', 'utf8')); - const bytecodeData = JSON.parse(fs.readFileSync('benchmark-bytecode.json', 'utf8')); - - let interpBaseline = null; - if (process.env.BASELINE_EXISTS === 'true') { - try { interpBaseline = JSON.parse(fs.readFileSync('benchmark-interpreted-results.json', 'utf8')); } - catch (e) { console.log('Failed to parse interpreted baseline:', e.message); } - } - let bytecodeBaseline = null; - if (process.env.BYTECODE_BASELINE_EXISTS === 'true') { - try { bytecodeBaseline = JSON.parse(fs.readFileSync('benchmark-bytecode-results.json', 'utf8')); } - catch (e) { console.log('Failed to parse bytecode baseline:', e.message); } - } - const hasAnyBaseline = interpBaseline || bytecodeBaseline; - - function buildMap(data) { - const map = {}; - if (!data) return map; - for (const file of data.files) { - const m = {}; - for (const b of file.benchmarks) { - if (!b.error) m[`${b.suite} > ${b.name}`] = normalizeBenchmark(b); - } - map[file.fileName] = m; - } - return map; - } - - const interpBaseMap = buildMap(interpBaseline); - const bytecodeBaseMap = buildMap(bytecodeBaseline); - const bytecodeCurrentMap = buildMap(bytecodeData); - - const newStats = () => ({ improved: 0, regressed: 0, unchanged: 0, new: 0, changes: [] }); - const totals = { interp: newStats(), bytecode: newStats() }; - let totalBenchmarks = 0; - const fileBlocks = []; - - for (const file of interpData.files) { - const fileName = file.fileName; - const iBase = interpBaseMap[fileName] || {}; - const bBase = bytecodeBaseMap[fileName] || {}; - const bCurrent = bytecodeCurrentMap[fileName] || {}; - const fileS = { interp: newStats(), bytecode: newStats() }; - let fileRows = ''; - const benches = file.benchmarks.filter(b => !b.error); - totalBenchmarks += benches.length; - - if (hasAnyBaseline) { - fileRows += '| Benchmark | Interpreted | ฮ” | Bytecode | ฮ” |\n'; - fileRows += '|-----------|------------|---|---------|---|\n'; - } else { - fileRows += '| Benchmark | Interpreted | Bytecode |\n'; - fileRows += '|-----------|------------|----------|\n'; - } - - for (const bench of benches) { - const key = `${bench.suite} > ${bench.name}`; - const iCurrent = normalizeBenchmark(bench); - const bCurrentEntry = bCurrent[key]; - const iBaseEntry = iBase[key]; - const bBaseEntry = bBase[key]; - - if (hasAnyBaseline) { - let iDelta; - if (iBaseEntry) { - const c = calculateChange(iBaseEntry, iCurrent); - const classification = classify(iBaseEntry, iCurrent); - if (c != null) fileS.interp.changes.push(c); - fileS.interp[classification]++; - iDelta = fmtDelta(classification, c); - } else { - iDelta = '๐Ÿ†•'; - fileS.interp.new++; - } - - let bDelta; - if (bCurrentEntry && bBaseEntry) { - const c = calculateChange(bBaseEntry, bCurrentEntry); - const classification = classify(bBaseEntry, bCurrentEntry); - if (c != null) fileS.bytecode.changes.push(c); - fileS.bytecode[classification]++; - bDelta = fmtDelta(classification, c); - } else if (bCurrentEntry) { - bDelta = '๐Ÿ†•'; - fileS.bytecode.new++; - } else { - bDelta = 'โ€”'; - } - - fileRows += `| ${bench.name} | ${fmtCell(iBaseEntry, iCurrent)} | ${iDelta} | ${bCurrentEntry ? fmtCell(bBaseEntry, bCurrentEntry) : 'โ€”'} | ${bDelta} |\n`; - } else { - fileRows += `| ${bench.name} | ${fmtRange(iCurrent)} | ${bCurrentEntry ? fmtRange(bCurrentEntry) : 'โ€”'} |\n`; - } - } - - for (const mode of ['interp', 'bytecode']) { - for (const k of ['improved', 'regressed', 'unchanged', 'new']) totals[mode][k] += fileS[mode][k]; - totals[mode].changes.push(...fileS[mode].changes); - } - - const hasSignificant = fileS.interp.improved + fileS.interp.regressed + fileS.bytecode.improved + fileS.bytecode.regressed > 0; - - function modeSummary(label, s) { - const p = []; - if (s.improved > 0) p.push(`๐ŸŸข ${s.improved}`); - if (s.regressed > 0) p.push(`๐Ÿ”ด ${s.regressed}`); - if (s.new > 0) p.push(`๐Ÿ†• ${s.new}`); - if (s.unchanged > 0) p.push(`${s.unchanged} unch.`); - const avg = fmtAvg(s.changes); - return `${label}: ${p.length > 0 ? p.join(', ') : `${s.changes.length + s.new}`}${avg ? ` ยท ${avg}` : ''}`; - } - - const fileSummary = hasAnyBaseline - ? modeSummary('Interp', fileS.interp) + ' ยท ' + modeSummary('Bytecode', fileS.bytecode) - : `${benches.length} benchmarks`; - - fileBlocks.push({ fileName, fileSummary, fileRows, hasSignificant }); - } - - let body = '## Benchmark Results\n\n'; - - if (hasAnyBaseline) { - function overallMode(label, s) { - const p = []; - if (s.improved > 0) p.push(`๐ŸŸข ${s.improved} improved`); - if (s.regressed > 0) p.push(`๐Ÿ”ด ${s.regressed} regressed`); - if (s.new > 0) p.push(`๐Ÿ†• ${s.new} new`); - if (s.unchanged > 0) p.push(`${s.unchanged} unchanged`); - const avg = fmtAvg(s.changes); - return `**${label}:** ${p.join(' ยท ')}${avg ? ` ยท ${avg}` : ''}`; - } - body += `**${totalBenchmarks}** benchmarks\n\n`; - body += overallMode('Interpreted', totals.interp) + '\\\n'; - body += overallMode('Bytecode', totals.bytecode) + '\n\n'; - } else { - body += `**${totalBenchmarks}** benchmarks (no baseline)\n\n`; - } - - for (const block of fileBlocks) { - const shortName = block.fileName.replace(/^benchmarks\//, ''); - const open = block.hasSignificant ? ' open' : ''; - body += `\n`; - body += `${shortName} โ€” ${block.fileSummary}\n\n`; - body += block.fileRows; - body += `\n\n\n`; - } + // Deterministic profile diff (opcode/allocation counts) is runner + // independent, so it compares the same-runner main build's profiles. const skipProfile = /\[skip-profile-check\]/.test(context.payload.pull_request?.body || ''); + let profileDiff = ''; try { - const profileDiff = buildProfileDiffSection({ - baselineDir: 'profile-baseline', + profileDiff = buildProfileDiffSection({ + baselineDir: 'profile-main', currentDir: 'profile-current', skip: skipProfile, - }); - body += profileDiff.markdown + '\n\n'; + }).markdown; } catch (error) { console.log('Failed to build deterministic profile diff:', error.message); - body += '## Deterministic profile diff\n\nDeterministic profile diff: unavailable due to a comparison error.\n\n'; + profileDiff = '## Deterministic profile diff\n\nDeterministic profile diff: unavailable due to a comparison error.'; } - body += 'Measured on ubuntu-latest x64. Benchmark ranges compare cached main-branch min/max ops/sec with the PR run; overlapping ranges are treated as unchanged noise. Percentage deltas are secondary context.'; + const { body: inner } = buildBenchmarkComparison({ + interpData: read('benchmark-interpreted.json'), + interpMain: tryRead('benchmark-interpreted-main.json'), + bytecodeData: read('benchmark-bytecode.json'), + bytecodeMain: tryRead('benchmark-bytecode-main.json'), + profileDiff, + runnerLabel: 'ubuntu-latest x64', + }); const marker = ''; - body = marker + '\n' + body; + const body = marker + '\n' + inner; const { data: comments } = await github.rest.issues.listComments({ owner: context.repo.owner, diff --git a/docs/adr/0076-same-runner-benchmark-comparison.md b/docs/adr/0076-same-runner-benchmark-comparison.md new file mode 100644 index 000000000..c02f695bf --- /dev/null +++ b/docs/adr/0076-same-runner-benchmark-comparison.md @@ -0,0 +1,13 @@ +# Same-runner benchmark comparison + +**Date:** 2026-06-27 +**Area:** `ci` / `tooling` +**Issue:** [#815](https://github.com/frostney/GocciaScript/issues/815) + +The PR "Benchmark Results" comment previously classified benchmarks as ๐ŸŸข improved / ๐Ÿ”ด regressed by comparing the PR's freshly-measured run against a **cached baseline measured in a different job, on a different GitHub-hosted runner, at a different time** (whatever `main` last pushed). The baseline was written by `ci.yml`'s `benchmark` job via `actions/cache/save` and restored in `pr.yml`'s `benchmark-comment` job by `restore-keys` prefix. Cross-runner/cross-time variance is systematically larger than each run's own min/max spread, so the range-overlap classifier (improved iff PR min > baseline max) flipped large numbers of benchmarks to improved/regressed even when the diff could not possibly affect them โ€” e.g. PR [#805](https://github.com/frostney/GocciaScript/pull/805) (regex-decode only) reported `arraybuffer.js` and `arrays.js`, which use no regex, as broadly improved. The verdicts were confident but not actionable, and a real regression was indistinguishable from runner noise. + +We now build the PR's **base commit** (`github.event.pull_request.base.sha` โ€” the `main` the PR targets) in a dedicated `build-main` job, and the `benchmark` job benchmarks that `main` build and the PR build **back-to-back on the same runner**, after a discarded warm-up run that sheds runner cold-start (page faults, sub-turbo clocks). The range-overlap classifier is unchanged; it now compares two runs measured under the same conditions, so the systematic between-runner offset cancels and only within-run noise remains โ€” which is exactly what range overlap is designed to absorb. The deterministic profile diff (opcode/allocation counts) is regenerated from the same `main` build instead of a cached profile, making the whole comparison self-contained; the cross-runner `benchmark-*-baseline-*` and `profile-baseline-*` caches in `ci.yml` and their restores in `pr.yml` are removed. The ~200-line comparison previously inlined in workflow YAML moved to [`scripts/benchmark-compare.js`](../../scripts/benchmark-compare.js) with unit tests in `scripts/test-benchmark-compare.ts`, so the classifier semantics โ€” including the #815 false-positive case โ€” are locked by a test rather than embedded in `pr.yml`. + +The explicit trade-off is cost: building and benchmarking `main` inside every PR run roughly **doubles the benchmark job's CI time** (one extra `--prod` build of the benchmark runner plus a second benchmark pass per mode). That is the accepted price of a trustworthy comparison. The comparison stays **advisory (comment-only)**, not a merge-blocking gate: even same-runner measurement has a noise floor (re-running the identical `--prod` binary showed ยฑ11โ€“15%, up to ยฑ59%), `VISION.md` (line 43) deprioritizes raw throughput, and test262 remains the gating regression check. The range-overlap rule absorbs that floor as "unchanged overlap," and the comment now reports the measured median per-run variance so readers can judge whether a sub-noise delta is meaningful. + +Alternatives considered and rejected: **normalizing the cached cross-runner baseline** to an in-run calibration reference โ€” cheaper, but a single calibration benchmark cannot cancel per-workload-class variance (memory-bound vs CPU-bound runners scale differently), so false verdicts shrink rather than disappear; a **consistent self-hosted runner** โ€” removes cross-runner variance structurally but adds infrastructure burden and runs untrusted PR code on owned hardware in a public repo; and **demoting timing entirely** in favour of the already-trustworthy deterministic profile diff โ€” honest and cheap, but the deterministic diff only catches regressions that change instruction or allocation counts, not a pure wall-clock-per-op slowdown, which is part of what the gate exists to surface. Per-file interleaving of the two binaries (the issue's "ideally interleaved") is a possible future refinement if residual same-runner drift proves material; the reported noise floor makes such drift visible. See [docs/build-system.md](../build-system.md) `benchmark` job reference and [VISION.md](../../VISION.md). diff --git a/docs/adr/README.md b/docs/adr/README.md index 820ac306d..1be3f122d 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -85,3 +85,4 @@ Durable architecture and implementation decisions for GocciaScript. New ADRs use - [0073 โ€” Opt-in ShadowRealm](0073-opt-in-shadowrealm.md) - [0074 โ€” Deferred bytecode call-stack frames](0074-deferred-bytecode-call-stack-frames.md) - [0075 โ€” ShadowRealm full test262 conformance](0075-shadowrealm-conformance.md) +- [0076 โ€” Same-runner benchmark comparison](0076-same-runner-benchmark-comparison.md) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 46c9e2073..39037c536 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -260,10 +260,10 @@ When a benchmark has a `setup` or `teardown` function, a second line displays th Benchmarks run as part of the CI pipeline in both **interpreter mode** and **bytecode mode**. CI uses `GOCCIA_BENCH_CALIBRATION_MS=100` and `GOCCIA_BENCH_ROUNDS=7` for stable measurements with IQR outlier filtering. On -pushes to `main`, the ubuntu-latest x64 runner saves JSON baselines for each -mode (`benchmark-interpreted-results.json` and `benchmark-bytecode-results.json`) -to the GitHub Actions cache. See [testing.md](testing.md#ci-integration) for the -full pipeline overview. +pushes to `main`, the ubuntu-latest x64 runner emits JSON and validates the +report shape. PR comparison builds and benchmarks `main` on the PR's own runner +rather than reading a cached baseline ([ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). +See [testing.md](testing.md#ci-integration) for the full pipeline overview. Main bytecode benchmark runs also retain deterministic VM profile reports. The GitHub Actions artifact is named `benchmark-profile` and contains: @@ -284,13 +284,15 @@ paths are `benchmark-profiles/runs//aggregate.json.gz`, ### PR Benchmark Comparison -The PR workflow (`.github/workflows/pr.yml`) restores the cached baseline JSON from `main`, runs the benchmark matrix, and posts a collapsible comparison comment on the PR. Each benchmark file gets a **unified table** with both execution modes side by side: +The PR workflow (`.github/workflows/pr.yml`) builds the PR's base commit (`main`) in a `build-main` job and benchmarks that `main` build and the PR build **back-to-back on the same runner** (after a discarded warm-up), so deltas reflect the diff rather than cross-runner variance (issue #815, [ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). The comparison logic lives in [`scripts/benchmark-compare.js`](../scripts/benchmark-compare.js) (unit-tested by `scripts/test-benchmark-compare.ts`). It posts a collapsible comparison comment on the PR; each benchmark file gets a **unified table** with both execution modes side by side: -- Each table row shows `| Benchmark | Interpreted | ฮ” | Bytecode | ฮ” |` with the point estimate and cached/PR min-max range in the form `10,000 ops/sec [9,500..10,500] โ†’ 9,200 ops/sec [8,700..9,700]` -- Classification uses **range overlap** instead of a single point value: if the PR run sits fully above the baseline range it is improved, if it sits fully below it is regressed, and overlapping ranges are treated as unchanged noise +- Each table row shows `| Benchmark | Interpreted (main โ†’ PR) | ฮ” | Bytecode (main โ†’ PR) | ฮ” |` with the point estimate and same-runner `main`/PR min-max range in the form `10,000 ops/sec [9,500..10,500] โ†’ 9,200 ops/sec [8,700..9,700]` +- Classification uses **range overlap** instead of a single point value: if the PR run sits fully above the `main` range it is improved, if it sits fully below it is regressed, and overlapping ranges are treated as unchanged noise - Percentage deltas remain in the `ฮ”` column as secondary context, even when the classifier marks a benchmark as `~ overlap` +- The overall summary reports the measured **median per-run variance** as a noise floor, so sub-noise deltas are read as unchanged - Results are **grouped by file**, each in a collapsible `
` section - Files with significant changes (improvements or regressions) are auto-expanded - Each file summary shows per-mode counts (e.g., `Interp: ๐ŸŸข 1, 7 unch. ยท Bytecode: ๐ŸŸข 2, 6 unch.`) - The **overall PR summary** shows per-mode totals on separate lines with average percentage deltas -- ๐ŸŸข marks non-overlapping improvements, ๐Ÿ”ด marks non-overlapping regressions, `~ overlap` marks overlapping ranges, and ๐Ÿ†• marks new benchmarks with no baseline +- The comparison is **advisory** (comment-only, never merge-blocking) +- ๐ŸŸข marks non-overlapping improvements, ๐Ÿ”ด marks non-overlapping regressions, `~ overlap` marks overlapping ranges, and ๐Ÿ†• marks new benchmarks absent from the `main` build diff --git a/docs/build-system.md b/docs/build-system.md index 1425460c8..99481b4a3 100644 --- a/docs/build-system.md +++ b/docs/build-system.md @@ -669,7 +669,7 @@ Runs on the full platform matrix: **`test262`** (needs build, ubuntu-latest x64 only, **non-blocking**) โ€” Downloads the `gocciascript-x86_64-linux` build, checks out [`tc39/test262`](https://github.com/tc39/test262) at the SHA pinned in `scripts/test262-suite-sha.txt`, runs `bun scripts/run_test262_suite.ts --suite-dir test262-suite --mode=bytecode --jobs=4 --timeout-ms=20000 --output=test262-results.json`, and uploads the report as a 30-day workflow artifact. The run step uses `continue-on-error: true` because the conformance lane is allowed to carry known steady-state failures while the engine closes compatibility gaps. On main, the JSON is also stashed via `actions/cache/save` under `test262-baseline-` so the PR workflow can compute ฮ” vs main, and `cd website && bun run publish-test262 ../test262-results.json` publishes the compressed run report plus its UTC daily dashboard pointer to Vercel Blob when `BLOB_READ_WRITE_TOKEN` is configured. Main runs also upload the `test262-profile` artifact and publish matching aggregate/detail profile payloads under the separate `test262-profiles/` Blob namespace for weekly performance review. The one-off `cd website && bun run backfill-test262` command seeds retained artifact reports and reruns expired historical days directly into Blob. The pin is bumped weekly by `.github/workflows/test262-bump.yml`. See [docs/test262.md](test262.md) for the harness and profile report contracts. -**`benchmark`** (needs build) โ€” Runs all benchmarks on all platforms. On main (ubuntu-latest x64), saves benchmark results as JSON to `actions/cache` for PR comparison. The main bytecode benchmark lane also captures deterministic VM profile details, uploads the `benchmark-profile` artifact, and publishes aggregate/detail profile payloads under the separate `benchmark-profiles/` Blob namespace when `BLOB_READ_WRITE_TOKEN` is configured. +**`benchmark`** (needs build) โ€” Runs all benchmarks on all platforms. On main (ubuntu-latest x64), it additionally emits JSON and validates the report shape. PR comparison no longer reads a cached baseline from here โ€” each PR builds and benchmarks `main` on its own runner ([ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). The main bytecode benchmark lane also captures deterministic VM profile details, uploads the `benchmark-profile` artifact, and publishes aggregate/detail profile payloads under the separate `benchmark-profiles/` Blob namespace when `BLOB_READ_WRITE_TOKEN` is configured. **`cli`** (needs build) โ€” Downloads pre-built binaries and runs CLI behavior smoke tests on all platforms via Bun: options across all apps, lexer numeric-separator rejection, parser error display, config-file loading, and app-specific features including Sandbox Runner seed baselines. Windows runs additionally assert that the loader binary does not link OpenSSL DLLs (HTTPS must use the platform TLS stack statically). @@ -692,7 +692,7 @@ Runs on **ubuntu-latest x64 only** (single runner, no matrix). **`test`** (needs build) โ€” Runs all JavaScript tests and Pascal unit tests. -**`benchmark`** (needs build) โ€” Restores the cached benchmark baseline JSON from main, runs all benchmarks with JSON output, and posts a collapsible comparison comment on the PR grouped by file. Each file section shows the cached baseline and PR `opsPerSec` point estimates side by side, with each point estimate carrying its min/max ops/sec range in brackets. Classification uses range overlap: fully above the baseline range is an improvement, fully below is a regression, and overlapping ranges are treated as unchanged noise. Percentage deltas are still shown as secondary context, and files with significant changes are auto-expanded. If no baseline exists, shows results without comparison. +**`benchmark`** (needs build + build-main) โ€” Compares the PR against a `main` build measured **on the same runner**. A separate `build-main` job builds the benchmark runner from the PR's base commit (`github.event.pull_request.base.sha`); the `benchmark` job then warms up the runner (discarded) and benchmarks the `main` build and the PR build back-to-back, so deltas reflect the diff rather than cross-runner variance (issue #815, [ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). It posts a collapsible comparison comment grouped by file: each file section shows the same-runner `main` and PR `opsPerSec` point estimates side by side, each carrying its min/max ops/sec range in brackets. Classification (in [`scripts/benchmark-compare.js`](../scripts/benchmark-compare.js), unit-tested by `scripts/test-benchmark-compare.ts`) uses range overlap: fully above the `main` range is an improvement, fully below is a regression, and overlapping ranges are treated as unchanged noise. The comment also reports the measured median per-run variance as a noise floor. Percentage deltas are secondary context, files with significant changes are auto-expanded, and the comparison is advisory (comment-only, never merge-blocking). The deterministic profile diff (opcode/allocation counts) is regenerated from the same `main` build. If no `main` build is available, shows results without comparison. **`test262`** (needs build, **non-blocking**) โ€” Checks out `tc39/test262` at the pinned SHA, runs `bun scripts/run_test262_suite.ts --suite-dir test262-suite --mode=bytecode --jobs=4 --timeout-ms=20000 --output=test262-results.json`, and uploads the JSON report. Failing tests do not fail the job. The downstream `test262-comment` job (`if: always()`) restores the most recent `test262-baseline-` cache entry from main, then `bun scripts/run_test262_suite.ts --comment test262-results.json ` builds the markdown body and the workflow posts/updates a comment using marker ``. The comment shows a per-category breakdown (built-ins, harness, intl402, language, staging) with ฮ”-vs-main columns when a baseline is cached, an "Areas closest to 100%" sub-table, and a collapsible per-test delta list. PR CI does not generate or upload the full-corpus test262 profile artifact. diff --git a/scripts/benchmark-compare.js b/scripts/benchmark-compare.js new file mode 100644 index 000000000..1f38cc79d --- /dev/null +++ b/scripts/benchmark-compare.js @@ -0,0 +1,274 @@ +#!/usr/bin/env node + +// Builds the PR "Benchmark Results" comment by comparing the PR build against a +// `main` build measured **on the same runner, back-to-back** in the same job. +// +// Why this module exists: the comparison used to read a `main` baseline from the +// Actions cache, written by a *different* job on a *different* GitHub-hosted +// runner at a *different* time. Cross-runner/cross-time variance dwarfs each +// run's own min/max spread, so the range-overlap classifier flipped large +// numbers of benchmarks to improved/regressed even when the diff could not have +// affected them (issue #815). Feeding the classifier two same-runner runs makes +// the systematic offset cancel; the range-overlap rule then only sees within-run +// noise, which it is designed for. See docs/adr/0076. +// +// The classifier itself is unchanged from the original inline implementation +// (range overlap on each run's own min/max). This module exists so that logic +// is a single tested unit instead of ~200 lines embedded in workflow YAML. + +const fmtOps = (n) => Math.round(n).toString().replace(/\B(?=(\d{3})+(?!\d))/g, ','); + +const fmtPercent = (change) => { + if (change == null) return 'N/A'; + return `${change >= 0 ? '+' : ''}${change.toFixed(1)}%`; +}; + +const fmtAvg = (changes) => { + if (changes.length === 0) return ''; + const avg = changes.reduce((a, b) => a + b, 0) / changes.length; + return `avg ${fmtPercent(avg)}`; +}; + +const fmtDelta = (classification, change) => { + if (change == null) return classification === 'new' ? '๐Ÿ†•' : 'N/A'; + if (classification === 'improved') return `๐ŸŸข ${fmtPercent(change)}`; + if (classification === 'regressed') return `๐Ÿ”ด ${fmtPercent(change)}`; + if (classification === 'unchanged') return `~ overlap (${fmtPercent(change)})`; + return '๐Ÿ†•'; +}; + +const fmtRange = (entry) => { + if (!entry) return 'โ€”'; + if (entry.minOpsPerSec === entry.maxOpsPerSec) return fmtOps(entry.opsPerSec); + return `${fmtOps(entry.minOpsPerSec)}..${fmtOps(entry.maxOpsPerSec)}`; +}; + +const fmtBenchmark = (entry) => { + if (!entry) return 'โ€”'; + if (entry.minOpsPerSec === entry.maxOpsPerSec) return `${fmtOps(entry.opsPerSec)} ops/sec`; + return `${fmtOps(entry.opsPerSec)} ops/sec [${fmtRange(entry)}]`; +}; + +const fmtCell = (baseEntry, currentEntry) => { + if (baseEntry) return `${fmtBenchmark(baseEntry)} โ†’ ${fmtBenchmark(currentEntry)}`; + return fmtBenchmark(currentEntry); +}; + +const normalizeBenchmark = (benchmark) => ({ + opsPerSec: benchmark.opsPerSec, + minOpsPerSec: benchmark.minOpsPerSec ?? benchmark.opsPerSec, + maxOpsPerSec: benchmark.maxOpsPerSec ?? benchmark.opsPerSec, + variancePercentage: benchmark.variancePercentage ?? 0, +}); + +const calculateChange = (baseEntry, currentEntry) => { + if (!baseEntry || !baseEntry.opsPerSec) return null; + return ((currentEntry.opsPerSec - baseEntry.opsPerSec) / baseEntry.opsPerSec) * 100; +}; + +// Range overlap on each run's own min/max range. Because both ranges now come +// from the same runner, the only thing separating them is within-run noise: a +// classification only fires when the PR range sits *entirely* above (improved) +// or *entirely* below (regressed) the main range. Overlap โ†’ unchanged noise. +const classify = (baseEntry, currentEntry) => { + if (!baseEntry) return 'new'; + if (currentEntry.minOpsPerSec > baseEntry.maxOpsPerSec) return 'improved'; + if (currentEntry.maxOpsPerSec < baseEntry.minOpsPerSec) return 'regressed'; + return 'unchanged'; +}; + +function buildMap(data) { + const map = {}; + if (!data) return map; + for (const file of data.files) { + const m = {}; + for (const b of file.benchmarks) { + if (!b.error) m[`${b.suite} > ${b.name}`] = normalizeBenchmark(b); + } + map[file.fileName] = m; + } + return map; +} + +// Median of each benchmark's own within-run variancePercentage: an honest, +// measured "noise floor" surfaced so readers can judge whether a sub-noise delta +// is meaningful. It is the spread the classifier already absorbs as overlap. +function medianVariance(data) { + if (!data) return null; + const values = []; + for (const file of data.files) { + for (const b of file.benchmarks) { + if (!b.error && typeof b.variancePercentage === 'number' && b.variancePercentage > 0) { + values.push(b.variancePercentage); + } + } + } + if (values.length === 0) return null; + values.sort((a, b) => a - b); + const mid = Math.floor(values.length / 2); + return values.length % 2 ? values[mid] : (values[mid - 1] + values[mid]) / 2; +} + +const newStats = () => ({ improved: 0, regressed: 0, unchanged: 0, new: 0, changes: [] }); + +// options: +// interpData, bytecodeData โ€” required PR-run reports (this run) +// interpMain, bytecodeMain โ€” same-runner `main`-build reports (may be null) +// profileDiff โ€” optional pre-rendered deterministic profile markdown +// runnerLabel โ€” e.g. 'ubuntu-latest x64' +// returns { body } โ€” markdown WITHOUT the HTML comment marker. +function buildBenchmarkComparison(options) { + const interpData = options.interpData; + const bytecodeData = options.bytecodeData; + const interpMain = options.interpMain || null; + const bytecodeMain = options.bytecodeMain || null; + const runnerLabel = options.runnerLabel || 'ubuntu-latest x64'; + + const hasAnyMain = interpMain || bytecodeMain; + + const interpMainMap = buildMap(interpMain); + const bytecodeMainMap = buildMap(bytecodeMain); + const bytecodeCurrentMap = buildMap(bytecodeData); + + const totals = { interp: newStats(), bytecode: newStats() }; + let totalBenchmarks = 0; + const fileBlocks = []; + + for (const file of interpData.files) { + const fileName = file.fileName; + const iBase = interpMainMap[fileName] || {}; + const bBase = bytecodeMainMap[fileName] || {}; + const bCurrent = bytecodeCurrentMap[fileName] || {}; + const fileS = { interp: newStats(), bytecode: newStats() }; + let fileRows = ''; + const benches = file.benchmarks.filter((b) => !b.error); + totalBenchmarks += benches.length; + + if (hasAnyMain) { + fileRows += '| Benchmark | Interpreted (main โ†’ PR) | ฮ” | Bytecode (main โ†’ PR) | ฮ” |\n'; + fileRows += '|-----------|------------|---|---------|---|\n'; + } else { + fileRows += '| Benchmark | Interpreted | Bytecode |\n'; + fileRows += '|-----------|------------|----------|\n'; + } + + for (const bench of benches) { + const key = `${bench.suite} > ${bench.name}`; + const iCurrent = normalizeBenchmark(bench); + const bCurrentEntry = bCurrent[key]; + const iBaseEntry = iBase[key]; + const bBaseEntry = bBase[key]; + + if (hasAnyMain) { + let iDelta; + if (iBaseEntry) { + const c = calculateChange(iBaseEntry, iCurrent); + const classification = classify(iBaseEntry, iCurrent); + if (c != null) fileS.interp.changes.push(c); + fileS.interp[classification]++; + iDelta = fmtDelta(classification, c); + } else { + iDelta = '๐Ÿ†•'; + fileS.interp.new++; + } + + let bDelta; + if (bCurrentEntry && bBaseEntry) { + const c = calculateChange(bBaseEntry, bCurrentEntry); + const classification = classify(bBaseEntry, bCurrentEntry); + if (c != null) fileS.bytecode.changes.push(c); + fileS.bytecode[classification]++; + bDelta = fmtDelta(classification, c); + } else if (bCurrentEntry) { + bDelta = '๐Ÿ†•'; + fileS.bytecode.new++; + } else { + bDelta = 'โ€”'; + } + + fileRows += `| ${bench.name} | ${fmtCell(iBaseEntry, iCurrent)} | ${iDelta} | ${bCurrentEntry ? fmtCell(bBaseEntry, bCurrentEntry) : 'โ€”'} | ${bDelta} |\n`; + } else { + fileRows += `| ${bench.name} | ${fmtRange(iCurrent)} | ${bCurrentEntry ? fmtRange(bCurrentEntry) : 'โ€”'} |\n`; + } + } + + for (const mode of ['interp', 'bytecode']) { + for (const k of ['improved', 'regressed', 'unchanged', 'new']) totals[mode][k] += fileS[mode][k]; + totals[mode].changes.push(...fileS[mode].changes); + } + + const hasSignificant = fileS.interp.improved + fileS.interp.regressed + fileS.bytecode.improved + fileS.bytecode.regressed > 0; + + function modeSummary(label, s) { + const p = []; + if (s.improved > 0) p.push(`๐ŸŸข ${s.improved}`); + if (s.regressed > 0) p.push(`๐Ÿ”ด ${s.regressed}`); + if (s.new > 0) p.push(`๐Ÿ†• ${s.new}`); + if (s.unchanged > 0) p.push(`${s.unchanged} unch.`); + const avg = fmtAvg(s.changes); + return `${label}: ${p.length > 0 ? p.join(', ') : `${s.changes.length + s.new}`}${avg ? ` ยท ${avg}` : ''}`; + } + + const fileSummary = hasAnyMain + ? modeSummary('Interp', fileS.interp) + ' ยท ' + modeSummary('Bytecode', fileS.bytecode) + : `${benches.length} benchmarks`; + + fileBlocks.push({ fileName, fileSummary, fileRows, hasSignificant }); + } + + let body = '## Benchmark Results\n\n'; + + if (hasAnyMain) { + function overallMode(label, s) { + const p = []; + if (s.improved > 0) p.push(`๐ŸŸข ${s.improved} improved`); + if (s.regressed > 0) p.push(`๐Ÿ”ด ${s.regressed} regressed`); + if (s.new > 0) p.push(`๐Ÿ†• ${s.new} new`); + if (s.unchanged > 0) p.push(`${s.unchanged} unchanged`); + const avg = fmtAvg(s.changes); + return `**${label}:** ${p.join(' ยท ')}${avg ? ` ยท ${avg}` : ''}`; + } + body += `**${totalBenchmarks}** benchmarks ยท PR vs same-runner \`main\` build\n\n`; + body += overallMode('Interpreted', totals.interp) + '\\\n'; + body += overallMode('Bytecode', totals.bytecode) + '\n\n'; + + const iNoise = medianVariance(interpData); + const bNoise = medianVariance(bytecodeData); + const noiseParts = []; + if (iNoise != null) noiseParts.push(`interpreted ยฑ${iNoise.toFixed(1)}%`); + if (bNoise != null) noiseParts.push(`bytecode ยฑ${bNoise.toFixed(1)}%`); + if (noiseParts.length > 0) { + body += `Typical per-run noise (median variance): ${noiseParts.join(', ')}. Deltas within noise overlap and read as unchanged.\n\n`; + } + } else { + body += `**${totalBenchmarks}** benchmarks (no same-runner \`main\` build to compare)\n\n`; + } + + for (const block of fileBlocks) { + const shortName = block.fileName.replace(/^benchmarks\//, ''); + const open = block.hasSignificant ? ' open' : ''; + body += `\n`; + body += `${shortName} โ€” ${block.fileSummary}\n\n`; + body += block.fileRows; + body += `\n
\n\n`; + } + + if (options.profileDiff) { + body += options.profileDiff + '\n\n'; + } + + body += hasAnyMain + ? `Measured on ${runnerLabel}. Each PR run also builds the \`main\` base and benchmarks it back-to-back on the same runner after a warm-up discard, so the ranges compare two runs measured under the same conditions; overlapping min/max ranges are treated as unchanged noise. Percentage deltas are secondary context. See docs/adr/0076.` + : `Measured on ${runnerLabel}. No \`main\` build was available to compare against, so these are raw PR-run numbers. See docs/adr/0076.`; + + return { body }; +} + +module.exports = { + buildBenchmarkComparison, + classify, + calculateChange, + normalizeBenchmark, + buildMap, + medianVariance, +}; diff --git a/scripts/profile-diff.js b/scripts/profile-diff.js index 1baa8724d..221f95f7a 100755 --- a/scripts/profile-diff.js +++ b/scripts/profile-diff.js @@ -217,7 +217,7 @@ function buildProfileDiffSection(options) { if (baselineFiles.length === 0) { return { - markdown: '## Deterministic profile diff\n\nDeterministic profile diff: no cached baseline was restored.', + markdown: '## Deterministic profile diff\n\nDeterministic profile diff: no `main`-build profile was available.', signalCount: 0, }; } diff --git a/scripts/test-benchmark-compare.ts b/scripts/test-benchmark-compare.ts new file mode 100644 index 000000000..3f5d839f9 --- /dev/null +++ b/scripts/test-benchmark-compare.ts @@ -0,0 +1,167 @@ +#!/usr/bin/env bun +/** + * test-benchmark-compare.ts + * + * Unit tests for scripts/benchmark-compare.js โ€” the PR "Benchmark Results" + * comment builder. Locks the range-overlap classifier and proves the issue + * #815 failure mode: when the PR and `main` ranges overlap (the realistic + * same-runner case), a benchmark untouched by the diff reads as `unchanged`, + * not improved/regressed โ€” even when its point-estimate delta is positive. + * + * Run: bun run scripts/test-benchmark-compare.ts + */ + +import { createRequire } from "node:module"; + +const require = createRequire(import.meta.url); +const { + classify, + medianVariance, + buildBenchmarkComparison, +} = require("./benchmark-compare.js"); + +let passed = 0; + +function assert(condition: unknown, message: string): void { + if (!condition) throw new Error(`Assertion failed: ${message}`); + passed++; +} + +function assertEqual(actual: T, expected: T, message: string): void { + if (actual !== expected) { + throw new Error(`${message}\n expected: ${JSON.stringify(expected)}\n actual: ${JSON.stringify(actual)}`); + } + passed++; +} + +// A benchmark entry: opsPerSec point estimate plus the min/max range across rounds. +const bench = ( + suite: string, + name: string, + opsPerSec: number, + min: number, + max: number, + variancePercentage = 0, +) => ({ suite, name, opsPerSec, minOpsPerSec: min, maxOpsPerSec: max, variancePercentage }); + +const report = (fileName: string, benchmarks: object[]) => ({ + files: [{ fileName, benchmarks }], + totalBenchmarks: benchmarks.length, +}); + +// -- classify(): pure range overlap ----------------------------------------------- + +console.log("classify() range overlap..."); +{ + // PR range entirely above main range โ†’ improved. + assertEqual( + classify({ minOpsPerSec: 100, maxOpsPerSec: 110 }, { minOpsPerSec: 120, maxOpsPerSec: 130 }), + "improved", + "PR min above main max is improved", + ); + // PR range entirely below main range โ†’ regressed. + assertEqual( + classify({ minOpsPerSec: 120, maxOpsPerSec: 130 }, { minOpsPerSec: 100, maxOpsPerSec: 110 }), + "regressed", + "PR max below main min is regressed", + ); + // Overlapping ranges โ†’ unchanged noise. + assertEqual( + classify({ minOpsPerSec: 100, maxOpsPerSec: 130 }, { minOpsPerSec: 110, maxOpsPerSec: 140 }), + "unchanged", + "overlapping ranges are unchanged", + ); + // No main entry โ†’ new. + assertEqual( + classify(undefined, { minOpsPerSec: 110, maxOpsPerSec: 140 }), + "new", + "missing main entry is new", + ); +} + +// -- Issue #815: same-runner overlap must NOT misclassify an untouched benchmark --- + +console.log("issue #815: untouched benchmark reads as unchanged..."); +{ + // Mirrors the real PR #805 evidence: a regex-only change reported arraybuffer + // (which uses no regex) as "improved" purely from cross-runner offset. With a + // same-runner `main` build the ranges overlap, so it reads as unchanged even + // though the point estimate ticked up +2.5%. + const main = { opsPerSec: 120000, minOpsPerSec: 111000, maxOpsPerSec: 127000 }; + const pr = { opsPerSec: 123000, minOpsPerSec: 110000, maxOpsPerSec: 127052 }; + assertEqual(classify(main, pr), "unchanged", "same-runner overlap is unchanged, not improved"); +} + +// -- medianVariance(): honest noise floor from measured data ----------------------- + +console.log("medianVariance()..."); +{ + const data = report("benchmarks/a.js", [ + bench("a", "x", 100, 90, 110, 2), + bench("a", "y", 100, 90, 110, 6), + bench("a", "z", 100, 90, 110, 4), + bench("a", "err-skipped", 0, 0, 0, 0), // zero variance is ignored + ]); + assertEqual(medianVariance(data), 4, "median of {2,4,6} is 4"); + assertEqual(medianVariance(null), null, "null data yields null noise"); +} + +// -- buildBenchmarkComparison(): with a same-runner main build --------------------- + +console.log("buildBenchmarkComparison() with main build..."); +{ + const interpData = report("benchmarks/arraybuffer.js", [ + bench("arraybuffer", "create", 123000, 110000, 127052, 5), + bench("arraybuffer", "brand new", 5000, 4900, 5100, 3), + ]); + const interpMain = report("benchmarks/arraybuffer.js", [ + bench("arraybuffer", "create", 120000, 111000, 127000, 5), + // "brand new" intentionally absent from main โ†’ should render as ๐Ÿ†•. + ]); + const bytecodeData = report("benchmarks/arraybuffer.js", [ + bench("arraybuffer", "create", 162000, 140000, 190000, 8), + ]); + const bytecodeMain = report("benchmarks/arraybuffer.js", [ + bench("arraybuffer", "create", 160000, 152000, 164000, 8), + ]); + + const { body } = buildBenchmarkComparison({ + interpData, + interpMain, + bytecodeData, + bytecodeMain, + runnerLabel: "ubuntu-latest x64", + }); + + assert(body.includes("PR vs same-runner `main` build"), "header names the same-runner main build"); + assert(body.includes("main โ†’ PR"), "table columns show main โ†’ PR direction"); + assert(body.includes("Typical per-run noise"), "noise floor line is present"); + assert(body.includes("docs/adr/0076"), "footnote references the ADR"); + assert(body.includes("๐Ÿ†•"), "benchmark missing from main renders as new"); + // The untouched 'create' benchmark overlaps โ†’ must read as unchanged overlap, + // never a green improvement. + assert(body.includes("~ overlap"), "overlapping benchmark renders as unchanged overlap"); + assert(!body.includes("๐ŸŸข 1 improved"), "no spurious improvement from same-runner overlap"); +} + +// -- buildBenchmarkComparison(): no main build to compare -------------------------- + +console.log("buildBenchmarkComparison() without main build..."); +{ + const interpData = report("benchmarks/arraybuffer.js", [bench("arraybuffer", "create", 123000, 110000, 127052)]); + const bytecodeData = report("benchmarks/arraybuffer.js", [bench("arraybuffer", "create", 162000, 140000, 190000)]); + + const { body } = buildBenchmarkComparison({ + interpData, + interpMain: null, + bytecodeData, + bytecodeMain: null, + }); + + assert(body.includes("no same-runner `main` build to compare"), "states that no main build was available"); + assert(!body.includes("ฮ”"), "no delta column without a main build"); + // The footnote must not claim a comparison that did not happen (issue #815 review). + assert(!body.includes("back-to-back"), "no-main footnote does not claim a back-to-back comparison"); +} + +console.log(`\nbenchmark-compare: ${passed} assertions passed.`); From f6647abcad1594004710bb9831557ca949b3b1db Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sat, 27 Jun 2026 09:46:43 +0100 Subject: [PATCH 2/4] docs(adr): link ADR 0076 to its pull request Co-Authored-By: Claude Opus 4.8 --- docs/adr/0076-same-runner-benchmark-comparison.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/adr/0076-same-runner-benchmark-comparison.md b/docs/adr/0076-same-runner-benchmark-comparison.md index c02f695bf..292c5792d 100644 --- a/docs/adr/0076-same-runner-benchmark-comparison.md +++ b/docs/adr/0076-same-runner-benchmark-comparison.md @@ -3,6 +3,7 @@ **Date:** 2026-06-27 **Area:** `ci` / `tooling` **Issue:** [#815](https://github.com/frostney/GocciaScript/issues/815) +**Pull Request:** [#886](https://github.com/frostney/GocciaScript/pull/886) The PR "Benchmark Results" comment previously classified benchmarks as ๐ŸŸข improved / ๐Ÿ”ด regressed by comparing the PR's freshly-measured run against a **cached baseline measured in a different job, on a different GitHub-hosted runner, at a different time** (whatever `main` last pushed). The baseline was written by `ci.yml`'s `benchmark` job via `actions/cache/save` and restored in `pr.yml`'s `benchmark-comment` job by `restore-keys` prefix. Cross-runner/cross-time variance is systematically larger than each run's own min/max spread, so the range-overlap classifier (improved iff PR min > baseline max) flipped large numbers of benchmarks to improved/regressed even when the diff could not possibly affect them โ€” e.g. PR [#805](https://github.com/frostney/GocciaScript/pull/805) (regex-decode only) reported `arraybuffer.js` and `arrays.js`, which use no regex, as broadly improved. The verdicts were confident but not actionable, and a real regression was indistinguishable from runner noise. From 4275c38ad5d57172b0eb2d7eccb161d62d448790 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sat, 27 Jun 2026 13:26:23 +0100 Subject: [PATCH 3/4] ci(benchmark): keep benchmark comment when build-main fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark job hard-depended on build-main (needs: [build, build-main]), so a build-main failure (base predating the benchmarkrunner target, or a transient FPC install hiccup) skipped benchmark and, transitively, benchmark-comment โ€” the PR got no benchmark feedback at all, even though build-pr succeeded. That made the downstream graceful-degradation path (optional main downloads + null-main handling in buildBenchmarkComparison) unreachable. Keep the build-main ordering dependency but run benchmark whenever `build` itself succeeds (if: !cancelled() && needs.build.result == 'success'), and make the main-build inputs optional: continue-on-error on the build-main download, a tolerant chmod, and if-no-files-found: ignore on the main result and profile uploads. When build-main fails the comparison now posts PR-only numbers instead of going silent. Addresses CodeRabbit review feedback on #886. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/pr.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 74e3f79f7..2c4fe5a86 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -193,7 +193,13 @@ jobs: fi benchmark: + # `needs` keeps build-main ordered before this job, but its failure must not + # skip the benchmark/benchmark-comment chain: when build-main fails (e.g. the + # base predates the benchmarkrunner target, or a transient FPC hiccup) the + # downstream comparison already degrades to PR-only numbers via the optional + # main downloads + null-main handling. Run whenever `build` itself succeeded. needs: [build, build-main] + if: ${{ !cancelled() && needs.build.result == 'success' }} runs-on: ubuntu-latest strategy: fail-fast: false @@ -216,6 +222,7 @@ jobs: - name: Download main benchmark runner uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7 + continue-on-error: true with: name: build-main path: build-main/ @@ -223,7 +230,8 @@ jobs: - name: Make binaries executable run: | chmod +x build/* - chmod +x build-main/* + # build-main is optional: a failed build-main job leaves this empty. + chmod +x build-main/* 2>/dev/null || true # Same-runner A/B (issue #815): the PR and `main` builds are measured on # this one runner so the delta reflects the diff, not cross-runner @@ -315,6 +323,8 @@ jobs: with: name: benchmark-${{ matrix.mode }}-main retention-days: 1 + # Absent when build-main failed; the comparison degrades to PR-only. + if-no-files-found: ignore path: benchmark-${{ matrix.mode }}-main.json - name: Upload deterministic profiles @@ -331,6 +341,8 @@ jobs: with: name: profile-main retention-days: 1 + # Absent when build-main failed; the profile diff degrades gracefully. + if-no-files-found: ignore path: profile-main/ cli: From e30b90b390b41e9e480bd316fe8f7a41fe385a9e Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sat, 27 Jun 2026 15:12:52 +0100 Subject: [PATCH 4/4] ci(benchmark): bound benchmark runs and lighten the warm-up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark (interpreted) leg timed out after 51 minutes on #886: the PR measured pass hung while the main pass (same workloads, same runner) had just completed in ~3 min. Locally the full interpreted suite runs in ~2 min, so this is a non-deterministic per-benchmark calibration runaway on a loaded GitHub runner, not a benchmark bug. The trigger was self-inflicted: the two-binary warm-up ran with the default 200 ms calibration (no override), making it ~a full extra pass per binary โ€” roughly 2.5x the interpreted work and heavy runner load right before the final measured pass, which is when calibration is most likely to misfire. There was also no timeout, so a runaway burned 51 min and red-marked the PR. - Replace the two-binary warm-up with a single cheap one (PR binary, ROUNDS=1 + CALIBRATION_MS=10): still ramps the CPU and warms the shared benchmark-file cache, but light and low-load. The runner's per-benchmark warm-up (5 iters) covers binary-specific warmth, so one binary suffices. - Wrap each pass in `timeout -k 30 480` (8 min cap; normal ~3 min). A killed main pass writes no JSON and degrades to PR-only; a killed PR pass fails fast instead of spinning. - Add `timeout-minutes: 30` to the benchmark job as a hard backstop. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/pr.yml | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 2c4fe5a86..9072f3509 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -201,6 +201,11 @@ jobs: needs: [build, build-main] if: ${{ !cancelled() && needs.build.result == 'success' }} runs-on: ubuntu-latest + # Backstop: a normal run is ~7-9 min. Per-benchmark calibration can + # occasionally run away on a loaded GitHub runner (issue #815 follow-up); the + # per-step `timeout`s below cap each pass, and this caps the whole leg so a + # runaway fails in minutes instead of burning the default 360-minute budget. + timeout-minutes: 30 strategy: fail-fast: false matrix: @@ -234,17 +239,23 @@ jobs: chmod +x build-main/* 2>/dev/null || true # Same-runner A/B (issue #815): the PR and `main` builds are measured on - # this one runner so the delta reflects the diff, not cross-runner - # variance. Both benchmark the PR's `benchmarks/` workloads. The warm-up - # runs are discarded on purpose โ€” they shed runner cold-start (page - # faults, sub-turbo clocks) for BOTH binaries so neither the `main` run - # (measured first) nor the PR run is penalised by being measured cold. - - name: Warm up benchmark runners + # this one runner so the delta reflects the diff, not cross-runner variance. + # Both benchmark the PR's `benchmarks/` workloads. + # + # One cheap, discarded warm-up pass ramps the runner CPU off its idle clock + # and warms the OS file cache for the benchmark sources (shared by both + # binaries) before measuring. It deliberately uses a tiny calibration + # (10 ms, 1 round) so it stays light: an earlier full two-binary warm-up + # roughly doubled the interpreted leg and loaded the runner right before + # measuring, destabilising per-benchmark calibration enough that a measured + # pass could run away for tens of minutes (issue #815 follow-up). The + # runner's built-in per-benchmark warm-up (5 iterations) covers + # binary-specific warmth, so warming one binary here is sufficient. + - name: Warm up benchmark runner env: GOCCIA_BENCH_ROUNDS: 1 - run: | - ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true - ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true + GOCCIA_BENCH_CALIBRATION_MS: 10 + run: timeout 120 ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true # `main` is built from the PR's base commit, so it cannot run benchmarks the # PR newly adds or changes to use new language features. The report is still @@ -252,18 +263,23 @@ jobs: # being blocked: a benchmark that errors on `main` simply has no baseline # entry and renders as ๐Ÿ†• new. The PR's own run below is intentionally NOT # tolerant โ€” a broken PR benchmark should fail the leg. + # `timeout` (GNU coreutils, present on ubuntu runners) caps a single pass: + # a normal interpreted/bytecode pass is ~3 min, so 8 min is generous, and a + # runaway calibration loop is killed (SIGKILL 30 s after SIGTERM) instead of + # spinning for the whole job. A killed `main` pass writes no JSON and the + # comparison degrades to PR-only via if-no-files-found + null-main handling. - name: Run benchmarks (main base) continue-on-error: true env: GOCCIA_BENCH_CALIBRATION_MS: 100 GOCCIA_BENCH_ROUNDS: 7 - run: ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}-main.json + run: timeout -k 30 480 ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}-main.json - name: Run benchmarks env: GOCCIA_BENCH_CALIBRATION_MS: 100 GOCCIA_BENCH_ROUNDS: 7 - run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json + run: timeout -k 30 480 ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json - name: Capture deterministic profiles (main base) if: matrix.mode == 'bytecode'