diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7f4311fe..a2db4044 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -779,7 +779,10 @@ jobs:
GOCCIA_BENCH_ROUNDS: 7
run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress
- - name: Run benchmarks and save baseline
+ # main/x86_64-linux emits JSON so the next step can validate the benchmark
+ # report shape. PR comparison no longer reads a cached baseline from here โ
+ # each PR builds and benchmarks `main` on its own runner (issue #815).
+ - name: Run benchmarks (validate JSON output)
if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux'
env:
GOCCIA_BENCH_CALIBRATION_MS: 100
@@ -818,13 +821,6 @@ jobs:
sys.exit(1)
PY
- - name: Cache benchmark baseline
- if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
- with:
- path: benchmark-${{ matrix.mode }}-results.json
- key: benchmark-${{ matrix.mode }}-baseline-${{ github.sha }}
-
- uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2
if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode'
@@ -855,13 +851,6 @@ jobs:
--summary-output benchmark-profile-aggregate.json \
--markdown-output benchmark-profile-aggregate.md
- - name: Cache deterministic profile baseline
- if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
- with:
- path: profile-baseline/
- key: profile-baseline-${{ github.sha }}
-
- name: Upload benchmark profile
if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode' && hashFiles('benchmark-profile-aggregate.json') != '' && hashFiles('profile-baseline/**') != ''
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
@@ -949,6 +938,9 @@ jobs:
- name: Check embedded resource CLI behaviour
run: bun run scripts/test-cli-embedded-resources.ts
+ - name: Check benchmark comparison logic
+ run: bun run scripts/test-benchmark-compare.ts
+
- name: Check Windows TLS dependencies
if: runner.os == 'Windows'
shell: pwsh
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index bf381442..9072f350 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -39,6 +39,35 @@ jobs:
retention-days: 1
path: pr-artifacts/*
+ # Builds the benchmark runner from the PR's base commit (the `main` the PR
+ # targets) so the `benchmark` job can measure it back-to-back with the PR build
+ # on the *same* runner. This is the trustworthy half of the same-runner A/B
+ # comparison that replaces the old cross-runner cached baseline (issue #815).
+ build-main:
+ runs-on: ubuntu-latest
+ defaults:
+ run:
+ shell: bash
+ steps:
+ - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
+ with:
+ ref: ${{ github.event.pull_request.base.sha }}
+ fetch-depth: 0
+ persist-credentials: false
+
+ - name: Install FPC
+ run: sudo apt-get update && sudo apt-get install fpc -qq > /dev/null
+
+ - name: Build benchmark runner (main base)
+ run: ./build.pas benchmarkrunner --prod
+
+ - name: Upload main benchmark runner
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+ with:
+ name: build-main
+ retention-days: 1
+ path: build/GocciaBenchmarkRunner
+
docs:
runs-on: ubuntu-latest
steps:
@@ -164,8 +193,19 @@ jobs:
fi
benchmark:
- needs: build
+ # `needs` keeps build-main ordered before this job, but its failure must not
+ # skip the benchmark/benchmark-comment chain: when build-main fails (e.g. the
+ # base predates the benchmarkrunner target, or a transient FPC hiccup) the
+ # downstream comparison already degrades to PR-only numbers via the optional
+ # main downloads + null-main handling. Run whenever `build` itself succeeded.
+ needs: [build, build-main]
+ if: ${{ !cancelled() && needs.build.result == 'success' }}
runs-on: ubuntu-latest
+ # Backstop: a normal run is ~7-9 min. Per-benchmark calibration can
+ # occasionally run away on a loaded GitHub runner (issue #815 follow-up); the
+ # per-step `timeout`s below cap each pass, and this caps the whole leg so a
+ # runaway fails in minutes instead of burning the default 360-minute budget.
+ timeout-minutes: 30
strategy:
fail-fast: false
matrix:
@@ -185,14 +225,81 @@ jobs:
name: build-pr
path: build/
+ - name: Download main benchmark runner
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+ continue-on-error: true
+ with:
+ name: build-main
+ path: build-main/
+
- name: Make binaries executable
- run: chmod +x build/*
+ run: |
+ chmod +x build/*
+ # build-main is optional: a failed build-main job leaves this empty.
+ chmod +x build-main/* 2>/dev/null || true
+
+ # Same-runner A/B (issue #815): the PR and `main` builds are measured on
+ # this one runner so the delta reflects the diff, not cross-runner variance.
+ # Both benchmark the PR's `benchmarks/` workloads.
+ #
+ # One cheap, discarded warm-up pass ramps the runner CPU off its idle clock
+ # and warms the OS file cache for the benchmark sources (shared by both
+ # binaries) before measuring. It deliberately uses a tiny calibration
+ # (10 ms, 1 round) so it stays light: an earlier full two-binary warm-up
+ # roughly doubled the interpreted leg and loaded the runner right before
+ # measuring, destabilising per-benchmark calibration enough that a measured
+ # pass could run away for tens of minutes (issue #815 follow-up). The
+ # runner's built-in per-benchmark warm-up (5 iterations) covers
+ # binary-specific warmth, so warming one binary here is sufficient.
+ - name: Warm up benchmark runner
+ env:
+ GOCCIA_BENCH_ROUNDS: 1
+ GOCCIA_BENCH_CALIBRATION_MS: 10
+ run: timeout 120 ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true
+
+ # `main` is built from the PR's base commit, so it cannot run benchmarks the
+ # PR newly adds or changes to use new language features. The report is still
+ # written before a non-zero exit, so continue-on-error keeps such a PR from
+ # being blocked: a benchmark that errors on `main` simply has no baseline
+ # entry and renders as ๐ new. The PR's own run below is intentionally NOT
+ # tolerant โ a broken PR benchmark should fail the leg.
+ # `timeout` (GNU coreutils, present on ubuntu runners) caps a single pass:
+ # a normal interpreted/bytecode pass is ~3 min, so 8 min is generous, and a
+ # runaway calibration loop is killed (SIGKILL 30 s after SIGTERM) instead of
+ # spinning for the whole job. A killed `main` pass writes no JSON and the
+ # comparison degrades to PR-only via if-no-files-found + null-main handling.
+ - name: Run benchmarks (main base)
+ continue-on-error: true
+ env:
+ GOCCIA_BENCH_CALIBRATION_MS: 100
+ GOCCIA_BENCH_ROUNDS: 7
+ run: timeout -k 30 480 ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}-main.json
- name: Run benchmarks
env:
GOCCIA_BENCH_CALIBRATION_MS: 100
GOCCIA_BENCH_ROUNDS: 7
- run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json
+ run: timeout -k 30 480 ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json
+
+ - name: Capture deterministic profiles (main base)
+ if: matrix.mode == 'bytecode'
+ continue-on-error: true
+ run: |
+ set -euo pipefail
+ mkdir -p profile-main
+ while IFS= read -r bench; do
+ rel="${bench#benchmarks/}"
+ out="profile-main/${rel%.*}.json"
+ mkdir -p "$(dirname "$out")"
+ ./build-main/GocciaBenchmarkRunner "$bench" \
+ --mode=bytecode \
+ --profile-deterministic \
+ --profile=all \
+ --profile-output="$out" \
+ --no-progress \
+ --format=compact-json \
+ > /dev/null
+ done < <(find benchmarks -type f \( -name '*.js' -o -name '*.jsx' -o -name '*.ts' -o -name '*.tsx' -o -name '*.mjs' \) ! -path '*/helpers/*' | sort)
- name: Capture deterministic profiles
if: matrix.mode == 'bytecode'
@@ -227,6 +334,15 @@ jobs:
retention-days: 1
path: benchmark-${{ matrix.mode }}.json
+ - name: Upload main results
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+ with:
+ name: benchmark-${{ matrix.mode }}-main
+ retention-days: 1
+ # Absent when build-main failed; the comparison degrades to PR-only.
+ if-no-files-found: ignore
+ path: benchmark-${{ matrix.mode }}-main.json
+
- name: Upload deterministic profiles
if: matrix.mode == 'bytecode'
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
@@ -235,6 +351,16 @@ jobs:
retention-days: 1
path: profile-current/
+ - name: Upload main deterministic profiles
+ if: matrix.mode == 'bytecode'
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+ with:
+ name: profile-main
+ retention-days: 1
+ # Absent when build-main failed; the profile diff degrades gracefully.
+ if-no-files-found: ignore
+ path: profile-main/
+
cli:
needs: build
runs-on: ubuntu-latest
@@ -276,6 +402,9 @@ jobs:
- name: Check embedded resource CLI behaviour
run: bun run scripts/test-cli-embedded-resources.ts
+ - name: Check benchmark comparison logic
+ run: bun run scripts/test-benchmark-compare.ts
+
test262:
needs: build
runs-on: ubuntu-latest
@@ -381,259 +510,65 @@ jobs:
name: profile-current
path: profile-current/
- - name: Restore interpreted baseline
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+ - name: Download interpreted main results
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+ continue-on-error: true
with:
- path: benchmark-interpreted-results.json
- key: benchmark-interpreted-baseline-will-not-match
- restore-keys: benchmark-interpreted-baseline-
+ name: benchmark-interpreted-main
- - name: Restore bytecode baseline
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+ - name: Download bytecode main results
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+ continue-on-error: true
with:
- path: benchmark-bytecode-results.json
- key: benchmark-bytecode-baseline-will-not-match
- restore-keys: benchmark-bytecode-baseline-
+ name: benchmark-bytecode-main
- - name: Restore deterministic profile baseline
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+ - name: Download main deterministic profiles
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+ continue-on-error: true
with:
- path: profile-baseline/
- key: profile-baseline-will-not-match
- restore-keys: profile-baseline-
-
- - name: Check baselines
- run: |
- if [ -f benchmark-interpreted-results.json ]; then
- echo "BASELINE_EXISTS=true" >> "$GITHUB_ENV"
- else
- echo "BASELINE_EXISTS=false" >> "$GITHUB_ENV"
- fi
- if [ -f benchmark-bytecode-results.json ]; then
- echo "BYTECODE_BASELINE_EXISTS=true" >> "$GITHUB_ENV"
- else
- echo "BYTECODE_BASELINE_EXISTS=false" >> "$GITHUB_ENV"
- fi
+ name: profile-main
+ path: profile-main/
- name: Compare benchmarks and comment
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
with:
script: |
const fs = require('fs');
+ const { buildBenchmarkComparison } = require('./scripts/benchmark-compare.js');
const { buildProfileDiffSection } = require('./scripts/profile-diff.js');
- const fmtOps = (n) => Math.round(n).toString().replace(/\B(?=(\d{3})+(?!\d))/g, ',');
- const fmtPercent = (change) => {
- if (change == null) return 'N/A';
- return `${change >= 0 ? '+' : ''}${change.toFixed(1)}%`;
- };
- const fmtAvg = (changes) => {
- if (changes.length === 0) return '';
- const avg = changes.reduce((a, b) => a + b, 0) / changes.length;
- return `avg ${fmtPercent(avg)}`;
- };
- const fmtDelta = (classification, change) => {
- if (change == null) return classification === 'new' ? '๐' : 'N/A';
- if (classification === 'improved') return `๐ข ${fmtPercent(change)}`;
- if (classification === 'regressed') return `๐ด ${fmtPercent(change)}`;
- if (classification === 'unchanged') return `~ overlap (${fmtPercent(change)})`;
- return '๐';
- };
- const fmtRange = (entry) => {
- if (!entry) return 'โ';
- if (entry.minOpsPerSec === entry.maxOpsPerSec) return fmtOps(entry.opsPerSec);
- return `${fmtOps(entry.minOpsPerSec)}..${fmtOps(entry.maxOpsPerSec)}`;
+ const read = (p) => JSON.parse(fs.readFileSync(p, 'utf8'));
+ const tryRead = (p) => {
+ try { return read(p); }
+ catch (e) { console.log(`No same-runner main result at ${p}:`, e.message); return null; }
};
- const fmtBenchmark = (entry) => {
- if (!entry) return 'โ';
- if (entry.minOpsPerSec === entry.maxOpsPerSec) return `${fmtOps(entry.opsPerSec)} ops/sec`;
- return `${fmtOps(entry.opsPerSec)} ops/sec [${fmtRange(entry)}]`;
- };
- const fmtCell = (baseEntry, currentEntry) => {
- if (baseEntry) return `${fmtBenchmark(baseEntry)} โ ${fmtBenchmark(currentEntry)}`;
- return fmtBenchmark(currentEntry);
- };
- const normalizeBenchmark = (benchmark) => ({
- opsPerSec: benchmark.opsPerSec,
- minOpsPerSec: benchmark.minOpsPerSec ?? benchmark.opsPerSec,
- maxOpsPerSec: benchmark.maxOpsPerSec ?? benchmark.opsPerSec,
- variancePercentage: benchmark.variancePercentage ?? 0,
- });
- const calculateChange = (baseEntry, currentEntry) => {
- if (!baseEntry || !baseEntry.opsPerSec) return null;
- return ((currentEntry.opsPerSec - baseEntry.opsPerSec) / baseEntry.opsPerSec) * 100;
- };
- const classify = (baseEntry, currentEntry) => {
- if (!baseEntry) return 'new';
- if (currentEntry.minOpsPerSec > baseEntry.maxOpsPerSec) return 'improved';
- if (currentEntry.maxOpsPerSec < baseEntry.minOpsPerSec) return 'regressed';
- return 'unchanged';
- };
-
- const interpData = JSON.parse(fs.readFileSync('benchmark-interpreted.json', 'utf8'));
- const bytecodeData = JSON.parse(fs.readFileSync('benchmark-bytecode.json', 'utf8'));
-
- let interpBaseline = null;
- if (process.env.BASELINE_EXISTS === 'true') {
- try { interpBaseline = JSON.parse(fs.readFileSync('benchmark-interpreted-results.json', 'utf8')); }
- catch (e) { console.log('Failed to parse interpreted baseline:', e.message); }
- }
- let bytecodeBaseline = null;
- if (process.env.BYTECODE_BASELINE_EXISTS === 'true') {
- try { bytecodeBaseline = JSON.parse(fs.readFileSync('benchmark-bytecode-results.json', 'utf8')); }
- catch (e) { console.log('Failed to parse bytecode baseline:', e.message); }
- }
- const hasAnyBaseline = interpBaseline || bytecodeBaseline;
-
- function buildMap(data) {
- const map = {};
- if (!data) return map;
- for (const file of data.files) {
- const m = {};
- for (const b of file.benchmarks) {
- if (!b.error) m[`${b.suite} > ${b.name}`] = normalizeBenchmark(b);
- }
- map[file.fileName] = m;
- }
- return map;
- }
-
- const interpBaseMap = buildMap(interpBaseline);
- const bytecodeBaseMap = buildMap(bytecodeBaseline);
- const bytecodeCurrentMap = buildMap(bytecodeData);
-
- const newStats = () => ({ improved: 0, regressed: 0, unchanged: 0, new: 0, changes: [] });
- const totals = { interp: newStats(), bytecode: newStats() };
- let totalBenchmarks = 0;
- const fileBlocks = [];
-
- for (const file of interpData.files) {
- const fileName = file.fileName;
- const iBase = interpBaseMap[fileName] || {};
- const bBase = bytecodeBaseMap[fileName] || {};
- const bCurrent = bytecodeCurrentMap[fileName] || {};
- const fileS = { interp: newStats(), bytecode: newStats() };
- let fileRows = '';
- const benches = file.benchmarks.filter(b => !b.error);
- totalBenchmarks += benches.length;
-
- if (hasAnyBaseline) {
- fileRows += '| Benchmark | Interpreted | ฮ | Bytecode | ฮ |\n';
- fileRows += '|-----------|------------|---|---------|---|\n';
- } else {
- fileRows += '| Benchmark | Interpreted | Bytecode |\n';
- fileRows += '|-----------|------------|----------|\n';
- }
-
- for (const bench of benches) {
- const key = `${bench.suite} > ${bench.name}`;
- const iCurrent = normalizeBenchmark(bench);
- const bCurrentEntry = bCurrent[key];
- const iBaseEntry = iBase[key];
- const bBaseEntry = bBase[key];
-
- if (hasAnyBaseline) {
- let iDelta;
- if (iBaseEntry) {
- const c = calculateChange(iBaseEntry, iCurrent);
- const classification = classify(iBaseEntry, iCurrent);
- if (c != null) fileS.interp.changes.push(c);
- fileS.interp[classification]++;
- iDelta = fmtDelta(classification, c);
- } else {
- iDelta = '๐';
- fileS.interp.new++;
- }
-
- let bDelta;
- if (bCurrentEntry && bBaseEntry) {
- const c = calculateChange(bBaseEntry, bCurrentEntry);
- const classification = classify(bBaseEntry, bCurrentEntry);
- if (c != null) fileS.bytecode.changes.push(c);
- fileS.bytecode[classification]++;
- bDelta = fmtDelta(classification, c);
- } else if (bCurrentEntry) {
- bDelta = '๐';
- fileS.bytecode.new++;
- } else {
- bDelta = 'โ';
- }
-
- fileRows += `| ${bench.name} | ${fmtCell(iBaseEntry, iCurrent)} | ${iDelta} | ${bCurrentEntry ? fmtCell(bBaseEntry, bCurrentEntry) : 'โ'} | ${bDelta} |\n`;
- } else {
- fileRows += `| ${bench.name} | ${fmtRange(iCurrent)} | ${bCurrentEntry ? fmtRange(bCurrentEntry) : 'โ'} |\n`;
- }
- }
-
- for (const mode of ['interp', 'bytecode']) {
- for (const k of ['improved', 'regressed', 'unchanged', 'new']) totals[mode][k] += fileS[mode][k];
- totals[mode].changes.push(...fileS[mode].changes);
- }
-
- const hasSignificant = fileS.interp.improved + fileS.interp.regressed + fileS.bytecode.improved + fileS.bytecode.regressed > 0;
-
- function modeSummary(label, s) {
- const p = [];
- if (s.improved > 0) p.push(`๐ข ${s.improved}`);
- if (s.regressed > 0) p.push(`๐ด ${s.regressed}`);
- if (s.new > 0) p.push(`๐ ${s.new}`);
- if (s.unchanged > 0) p.push(`${s.unchanged} unch.`);
- const avg = fmtAvg(s.changes);
- return `${label}: ${p.length > 0 ? p.join(', ') : `${s.changes.length + s.new}`}${avg ? ` ยท ${avg}` : ''}`;
- }
-
- const fileSummary = hasAnyBaseline
- ? modeSummary('Interp', fileS.interp) + ' ยท ' + modeSummary('Bytecode', fileS.bytecode)
- : `${benches.length} benchmarks`;
-
- fileBlocks.push({ fileName, fileSummary, fileRows, hasSignificant });
- }
-
- let body = '## Benchmark Results\n\n';
-
- if (hasAnyBaseline) {
- function overallMode(label, s) {
- const p = [];
- if (s.improved > 0) p.push(`๐ข ${s.improved} improved`);
- if (s.regressed > 0) p.push(`๐ด ${s.regressed} regressed`);
- if (s.new > 0) p.push(`๐ ${s.new} new`);
- if (s.unchanged > 0) p.push(`${s.unchanged} unchanged`);
- const avg = fmtAvg(s.changes);
- return `**${label}:** ${p.join(' ยท ')}${avg ? ` ยท ${avg}` : ''}`;
- }
- body += `**${totalBenchmarks}** benchmarks\n\n`;
- body += overallMode('Interpreted', totals.interp) + '\\\n';
- body += overallMode('Bytecode', totals.bytecode) + '\n\n';
- } else {
- body += `**${totalBenchmarks}** benchmarks (no baseline)\n\n`;
- }
-
- for (const block of fileBlocks) {
- const shortName = block.fileName.replace(/^benchmarks\//, '');
- const open = block.hasSignificant ? ' open' : '';
- body += `\n`;
- body += `${shortName} โ ${block.fileSummary}
\n\n`;
- body += block.fileRows;
- body += `\n \n\n`;
- }
+ // Deterministic profile diff (opcode/allocation counts) is runner
+ // independent, so it compares the same-runner main build's profiles.
const skipProfile = /\[skip-profile-check\]/.test(context.payload.pull_request?.body || '');
+ let profileDiff = '';
try {
- const profileDiff = buildProfileDiffSection({
- baselineDir: 'profile-baseline',
+ profileDiff = buildProfileDiffSection({
+ baselineDir: 'profile-main',
currentDir: 'profile-current',
skip: skipProfile,
- });
- body += profileDiff.markdown + '\n\n';
+ }).markdown;
} catch (error) {
console.log('Failed to build deterministic profile diff:', error.message);
- body += '## Deterministic profile diff\n\nDeterministic profile diff: unavailable due to a comparison error.\n\n';
+ profileDiff = '## Deterministic profile diff\n\nDeterministic profile diff: unavailable due to a comparison error.';
}
- body += 'Measured on ubuntu-latest x64. Benchmark ranges compare cached main-branch min/max ops/sec with the PR run; overlapping ranges are treated as unchanged noise. Percentage deltas are secondary context.';
+ const { body: inner } = buildBenchmarkComparison({
+ interpData: read('benchmark-interpreted.json'),
+ interpMain: tryRead('benchmark-interpreted-main.json'),
+ bytecodeData: read('benchmark-bytecode.json'),
+ bytecodeMain: tryRead('benchmark-bytecode-main.json'),
+ profileDiff,
+ runnerLabel: 'ubuntu-latest x64',
+ });
const marker = '';
- body = marker + '\n' + body;
+ const body = marker + '\n' + inner;
const { data: comments } = await github.rest.issues.listComments({
owner: context.repo.owner,
diff --git a/docs/adr/0076-same-runner-benchmark-comparison.md b/docs/adr/0076-same-runner-benchmark-comparison.md
new file mode 100644
index 00000000..292c5792
--- /dev/null
+++ b/docs/adr/0076-same-runner-benchmark-comparison.md
@@ -0,0 +1,14 @@
+# Same-runner benchmark comparison
+
+**Date:** 2026-06-27
+**Area:** `ci` / `tooling`
+**Issue:** [#815](https://github.com/frostney/GocciaScript/issues/815)
+**Pull Request:** [#886](https://github.com/frostney/GocciaScript/pull/886)
+
+The PR "Benchmark Results" comment previously classified benchmarks as ๐ข improved / ๐ด regressed by comparing the PR's freshly-measured run against a **cached baseline measured in a different job, on a different GitHub-hosted runner, at a different time** (whatever `main` last pushed). The baseline was written by `ci.yml`'s `benchmark` job via `actions/cache/save` and restored in `pr.yml`'s `benchmark-comment` job by `restore-keys` prefix. Cross-runner/cross-time variance is systematically larger than each run's own min/max spread, so the range-overlap classifier (improved iff PR min > baseline max) flipped large numbers of benchmarks to improved/regressed even when the diff could not possibly affect them โ e.g. PR [#805](https://github.com/frostney/GocciaScript/pull/805) (regex-decode only) reported `arraybuffer.js` and `arrays.js`, which use no regex, as broadly improved. The verdicts were confident but not actionable, and a real regression was indistinguishable from runner noise.
+
+We now build the PR's **base commit** (`github.event.pull_request.base.sha` โ the `main` the PR targets) in a dedicated `build-main` job, and the `benchmark` job benchmarks that `main` build and the PR build **back-to-back on the same runner**, after a discarded warm-up run that sheds runner cold-start (page faults, sub-turbo clocks). The range-overlap classifier is unchanged; it now compares two runs measured under the same conditions, so the systematic between-runner offset cancels and only within-run noise remains โ which is exactly what range overlap is designed to absorb. The deterministic profile diff (opcode/allocation counts) is regenerated from the same `main` build instead of a cached profile, making the whole comparison self-contained; the cross-runner `benchmark-*-baseline-*` and `profile-baseline-*` caches in `ci.yml` and their restores in `pr.yml` are removed. The ~200-line comparison previously inlined in workflow YAML moved to [`scripts/benchmark-compare.js`](../../scripts/benchmark-compare.js) with unit tests in `scripts/test-benchmark-compare.ts`, so the classifier semantics โ including the #815 false-positive case โ are locked by a test rather than embedded in `pr.yml`.
+
+The explicit trade-off is cost: building and benchmarking `main` inside every PR run roughly **doubles the benchmark job's CI time** (one extra `--prod` build of the benchmark runner plus a second benchmark pass per mode). That is the accepted price of a trustworthy comparison. The comparison stays **advisory (comment-only)**, not a merge-blocking gate: even same-runner measurement has a noise floor (re-running the identical `--prod` binary showed ยฑ11โ15%, up to ยฑ59%), `VISION.md` (line 43) deprioritizes raw throughput, and test262 remains the gating regression check. The range-overlap rule absorbs that floor as "unchanged overlap," and the comment now reports the measured median per-run variance so readers can judge whether a sub-noise delta is meaningful.
+
+Alternatives considered and rejected: **normalizing the cached cross-runner baseline** to an in-run calibration reference โ cheaper, but a single calibration benchmark cannot cancel per-workload-class variance (memory-bound vs CPU-bound runners scale differently), so false verdicts shrink rather than disappear; a **consistent self-hosted runner** โ removes cross-runner variance structurally but adds infrastructure burden and runs untrusted PR code on owned hardware in a public repo; and **demoting timing entirely** in favour of the already-trustworthy deterministic profile diff โ honest and cheap, but the deterministic diff only catches regressions that change instruction or allocation counts, not a pure wall-clock-per-op slowdown, which is part of what the gate exists to surface. Per-file interleaving of the two binaries (the issue's "ideally interleaved") is a possible future refinement if residual same-runner drift proves material; the reported noise floor makes such drift visible. See [docs/build-system.md](../build-system.md) `benchmark` job reference and [VISION.md](../../VISION.md).
diff --git a/docs/adr/README.md b/docs/adr/README.md
index 820ac306..1be3f122 100644
--- a/docs/adr/README.md
+++ b/docs/adr/README.md
@@ -85,3 +85,4 @@ Durable architecture and implementation decisions for GocciaScript. New ADRs use
- [0073 โ Opt-in ShadowRealm](0073-opt-in-shadowrealm.md)
- [0074 โ Deferred bytecode call-stack frames](0074-deferred-bytecode-call-stack-frames.md)
- [0075 โ ShadowRealm full test262 conformance](0075-shadowrealm-conformance.md)
+- [0076 โ Same-runner benchmark comparison](0076-same-runner-benchmark-comparison.md)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 46c9e207..39037c53 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -260,10 +260,10 @@ When a benchmark has a `setup` or `teardown` function, a second line displays th
Benchmarks run as part of the CI pipeline in both **interpreter mode** and
**bytecode mode**. CI uses `GOCCIA_BENCH_CALIBRATION_MS=100` and
`GOCCIA_BENCH_ROUNDS=7` for stable measurements with IQR outlier filtering. On
-pushes to `main`, the ubuntu-latest x64 runner saves JSON baselines for each
-mode (`benchmark-interpreted-results.json` and `benchmark-bytecode-results.json`)
-to the GitHub Actions cache. See [testing.md](testing.md#ci-integration) for the
-full pipeline overview.
+pushes to `main`, the ubuntu-latest x64 runner emits JSON and validates the
+report shape. PR comparison builds and benchmarks `main` on the PR's own runner
+rather than reading a cached baseline ([ADR 0076](adr/0076-same-runner-benchmark-comparison.md)).
+See [testing.md](testing.md#ci-integration) for the full pipeline overview.
Main bytecode benchmark runs also retain deterministic VM profile reports. The
GitHub Actions artifact is named `benchmark-profile` and contains:
@@ -284,13 +284,15 @@ paths are `benchmark-profiles/runs//aggregate.json.gz`,
### PR Benchmark Comparison
-The PR workflow (`.github/workflows/pr.yml`) restores the cached baseline JSON from `main`, runs the benchmark matrix, and posts a collapsible comparison comment on the PR. Each benchmark file gets a **unified table** with both execution modes side by side:
+The PR workflow (`.github/workflows/pr.yml`) builds the PR's base commit (`main`) in a `build-main` job and benchmarks that `main` build and the PR build **back-to-back on the same runner** (after a discarded warm-up), so deltas reflect the diff rather than cross-runner variance (issue #815, [ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). The comparison logic lives in [`scripts/benchmark-compare.js`](../scripts/benchmark-compare.js) (unit-tested by `scripts/test-benchmark-compare.ts`). It posts a collapsible comparison comment on the PR; each benchmark file gets a **unified table** with both execution modes side by side:
-- Each table row shows `| Benchmark | Interpreted | ฮ | Bytecode | ฮ |` with the point estimate and cached/PR min-max range in the form `10,000 ops/sec [9,500..10,500] โ 9,200 ops/sec [8,700..9,700]`
-- Classification uses **range overlap** instead of a single point value: if the PR run sits fully above the baseline range it is improved, if it sits fully below it is regressed, and overlapping ranges are treated as unchanged noise
+- Each table row shows `| Benchmark | Interpreted (main โ PR) | ฮ | Bytecode (main โ PR) | ฮ |` with the point estimate and same-runner `main`/PR min-max range in the form `10,000 ops/sec [9,500..10,500] โ 9,200 ops/sec [8,700..9,700]`
+- Classification uses **range overlap** instead of a single point value: if the PR run sits fully above the `main` range it is improved, if it sits fully below it is regressed, and overlapping ranges are treated as unchanged noise
- Percentage deltas remain in the `ฮ` column as secondary context, even when the classifier marks a benchmark as `~ overlap`
+- The overall summary reports the measured **median per-run variance** as a noise floor, so sub-noise deltas are read as unchanged
- Results are **grouped by file**, each in a collapsible `` section
- Files with significant changes (improvements or regressions) are auto-expanded
- Each file summary shows per-mode counts (e.g., `Interp: ๐ข 1, 7 unch. ยท Bytecode: ๐ข 2, 6 unch.`)
- The **overall PR summary** shows per-mode totals on separate lines with average percentage deltas
-- ๐ข marks non-overlapping improvements, ๐ด marks non-overlapping regressions, `~ overlap` marks overlapping ranges, and ๐ marks new benchmarks with no baseline
+- The comparison is **advisory** (comment-only, never merge-blocking)
+- ๐ข marks non-overlapping improvements, ๐ด marks non-overlapping regressions, `~ overlap` marks overlapping ranges, and ๐ marks new benchmarks absent from the `main` build
diff --git a/docs/build-system.md b/docs/build-system.md
index 1425460c..99481b4a 100644
--- a/docs/build-system.md
+++ b/docs/build-system.md
@@ -669,7 +669,7 @@ Runs on the full platform matrix:
**`test262`** (needs build, ubuntu-latest x64 only, **non-blocking**) โ Downloads the `gocciascript-x86_64-linux` build, checks out [`tc39/test262`](https://github.com/tc39/test262) at the SHA pinned in `scripts/test262-suite-sha.txt`, runs `bun scripts/run_test262_suite.ts --suite-dir test262-suite --mode=bytecode --jobs=4 --timeout-ms=20000 --output=test262-results.json`, and uploads the report as a 30-day workflow artifact. The run step uses `continue-on-error: true` because the conformance lane is allowed to carry known steady-state failures while the engine closes compatibility gaps. On main, the JSON is also stashed via `actions/cache/save` under `test262-baseline-` so the PR workflow can compute ฮ vs main, and `cd website && bun run publish-test262 ../test262-results.json` publishes the compressed run report plus its UTC daily dashboard pointer to Vercel Blob when `BLOB_READ_WRITE_TOKEN` is configured. Main runs also upload the `test262-profile` artifact and publish matching aggregate/detail profile payloads under the separate `test262-profiles/` Blob namespace for weekly performance review. The one-off `cd website && bun run backfill-test262` command seeds retained artifact reports and reruns expired historical days directly into Blob. The pin is bumped weekly by `.github/workflows/test262-bump.yml`. See [docs/test262.md](test262.md) for the harness and profile report contracts.
-**`benchmark`** (needs build) โ Runs all benchmarks on all platforms. On main (ubuntu-latest x64), saves benchmark results as JSON to `actions/cache` for PR comparison. The main bytecode benchmark lane also captures deterministic VM profile details, uploads the `benchmark-profile` artifact, and publishes aggregate/detail profile payloads under the separate `benchmark-profiles/` Blob namespace when `BLOB_READ_WRITE_TOKEN` is configured.
+**`benchmark`** (needs build) โ Runs all benchmarks on all platforms. On main (ubuntu-latest x64), it additionally emits JSON and validates the report shape. PR comparison no longer reads a cached baseline from here โ each PR builds and benchmarks `main` on its own runner ([ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). The main bytecode benchmark lane also captures deterministic VM profile details, uploads the `benchmark-profile` artifact, and publishes aggregate/detail profile payloads under the separate `benchmark-profiles/` Blob namespace when `BLOB_READ_WRITE_TOKEN` is configured.
**`cli`** (needs build) โ Downloads pre-built binaries and runs CLI behavior smoke tests on all platforms via Bun: options across all apps, lexer numeric-separator rejection, parser error display, config-file loading, and app-specific features including Sandbox Runner seed baselines. Windows runs additionally assert that the loader binary does not link OpenSSL DLLs (HTTPS must use the platform TLS stack statically).
@@ -692,7 +692,7 @@ Runs on **ubuntu-latest x64 only** (single runner, no matrix).
**`test`** (needs build) โ Runs all JavaScript tests and Pascal unit tests.
-**`benchmark`** (needs build) โ Restores the cached benchmark baseline JSON from main, runs all benchmarks with JSON output, and posts a collapsible comparison comment on the PR grouped by file. Each file section shows the cached baseline and PR `opsPerSec` point estimates side by side, with each point estimate carrying its min/max ops/sec range in brackets. Classification uses range overlap: fully above the baseline range is an improvement, fully below is a regression, and overlapping ranges are treated as unchanged noise. Percentage deltas are still shown as secondary context, and files with significant changes are auto-expanded. If no baseline exists, shows results without comparison.
+**`benchmark`** (needs build + build-main) โ Compares the PR against a `main` build measured **on the same runner**. A separate `build-main` job builds the benchmark runner from the PR's base commit (`github.event.pull_request.base.sha`); the `benchmark` job then warms up the runner (discarded) and benchmarks the `main` build and the PR build back-to-back, so deltas reflect the diff rather than cross-runner variance (issue #815, [ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). It posts a collapsible comparison comment grouped by file: each file section shows the same-runner `main` and PR `opsPerSec` point estimates side by side, each carrying its min/max ops/sec range in brackets. Classification (in [`scripts/benchmark-compare.js`](../scripts/benchmark-compare.js), unit-tested by `scripts/test-benchmark-compare.ts`) uses range overlap: fully above the `main` range is an improvement, fully below is a regression, and overlapping ranges are treated as unchanged noise. The comment also reports the measured median per-run variance as a noise floor. Percentage deltas are secondary context, files with significant changes are auto-expanded, and the comparison is advisory (comment-only, never merge-blocking). The deterministic profile diff (opcode/allocation counts) is regenerated from the same `main` build. If no `main` build is available, shows results without comparison.
**`test262`** (needs build, **non-blocking**) โ Checks out `tc39/test262` at the pinned SHA, runs `bun scripts/run_test262_suite.ts --suite-dir test262-suite --mode=bytecode --jobs=4 --timeout-ms=20000 --output=test262-results.json`, and uploads the JSON report. Failing tests do not fail the job. The downstream `test262-comment` job (`if: always()`) restores the most recent `test262-baseline-` cache entry from main, then `bun scripts/run_test262_suite.ts --comment test262-results.json ` builds the markdown body and the workflow posts/updates a comment using marker ``. The comment shows a per-category breakdown (built-ins, harness, intl402, language, staging) with ฮ-vs-main columns when a baseline is cached, an "Areas closest to 100%" sub-table, and a collapsible per-test delta list. PR CI does not generate or upload the full-corpus test262 profile artifact.
diff --git a/scripts/benchmark-compare.js b/scripts/benchmark-compare.js
new file mode 100644
index 00000000..1f38cc79
--- /dev/null
+++ b/scripts/benchmark-compare.js
@@ -0,0 +1,274 @@
+#!/usr/bin/env node
+
+// Builds the PR "Benchmark Results" comment by comparing the PR build against a
+// `main` build measured **on the same runner, back-to-back** in the same job.
+//
+// Why this module exists: the comparison used to read a `main` baseline from the
+// Actions cache, written by a *different* job on a *different* GitHub-hosted
+// runner at a *different* time. Cross-runner/cross-time variance dwarfs each
+// run's own min/max spread, so the range-overlap classifier flipped large
+// numbers of benchmarks to improved/regressed even when the diff could not have
+// affected them (issue #815). Feeding the classifier two same-runner runs makes
+// the systematic offset cancel; the range-overlap rule then only sees within-run
+// noise, which it is designed for. See docs/adr/0076.
+//
+// The classifier itself is unchanged from the original inline implementation
+// (range overlap on each run's own min/max). This module exists so that logic
+// is a single tested unit instead of ~200 lines embedded in workflow YAML.
+
+const fmtOps = (n) => Math.round(n).toString().replace(/\B(?=(\d{3})+(?!\d))/g, ',');
+
+const fmtPercent = (change) => {
+ if (change == null) return 'N/A';
+ return `${change >= 0 ? '+' : ''}${change.toFixed(1)}%`;
+};
+
+const fmtAvg = (changes) => {
+ if (changes.length === 0) return '';
+ const avg = changes.reduce((a, b) => a + b, 0) / changes.length;
+ return `avg ${fmtPercent(avg)}`;
+};
+
+const fmtDelta = (classification, change) => {
+ if (change == null) return classification === 'new' ? '๐' : 'N/A';
+ if (classification === 'improved') return `๐ข ${fmtPercent(change)}`;
+ if (classification === 'regressed') return `๐ด ${fmtPercent(change)}`;
+ if (classification === 'unchanged') return `~ overlap (${fmtPercent(change)})`;
+ return '๐';
+};
+
+const fmtRange = (entry) => {
+ if (!entry) return 'โ';
+ if (entry.minOpsPerSec === entry.maxOpsPerSec) return fmtOps(entry.opsPerSec);
+ return `${fmtOps(entry.minOpsPerSec)}..${fmtOps(entry.maxOpsPerSec)}`;
+};
+
+const fmtBenchmark = (entry) => {
+ if (!entry) return 'โ';
+ if (entry.minOpsPerSec === entry.maxOpsPerSec) return `${fmtOps(entry.opsPerSec)} ops/sec`;
+ return `${fmtOps(entry.opsPerSec)} ops/sec [${fmtRange(entry)}]`;
+};
+
+const fmtCell = (baseEntry, currentEntry) => {
+ if (baseEntry) return `${fmtBenchmark(baseEntry)} โ ${fmtBenchmark(currentEntry)}`;
+ return fmtBenchmark(currentEntry);
+};
+
+const normalizeBenchmark = (benchmark) => ({
+ opsPerSec: benchmark.opsPerSec,
+ minOpsPerSec: benchmark.minOpsPerSec ?? benchmark.opsPerSec,
+ maxOpsPerSec: benchmark.maxOpsPerSec ?? benchmark.opsPerSec,
+ variancePercentage: benchmark.variancePercentage ?? 0,
+});
+
+const calculateChange = (baseEntry, currentEntry) => {
+ if (!baseEntry || !baseEntry.opsPerSec) return null;
+ return ((currentEntry.opsPerSec - baseEntry.opsPerSec) / baseEntry.opsPerSec) * 100;
+};
+
+// Range overlap on each run's own min/max range. Because both ranges now come
+// from the same runner, the only thing separating them is within-run noise: a
+// classification only fires when the PR range sits *entirely* above (improved)
+// or *entirely* below (regressed) the main range. Overlap โ unchanged noise.
+const classify = (baseEntry, currentEntry) => {
+ if (!baseEntry) return 'new';
+ if (currentEntry.minOpsPerSec > baseEntry.maxOpsPerSec) return 'improved';
+ if (currentEntry.maxOpsPerSec < baseEntry.minOpsPerSec) return 'regressed';
+ return 'unchanged';
+};
+
+function buildMap(data) {
+ const map = {};
+ if (!data) return map;
+ for (const file of data.files) {
+ const m = {};
+ for (const b of file.benchmarks) {
+ if (!b.error) m[`${b.suite} > ${b.name}`] = normalizeBenchmark(b);
+ }
+ map[file.fileName] = m;
+ }
+ return map;
+}
+
+// Median of each benchmark's own within-run variancePercentage: an honest,
+// measured "noise floor" surfaced so readers can judge whether a sub-noise delta
+// is meaningful. It is the spread the classifier already absorbs as overlap.
+function medianVariance(data) {
+ if (!data) return null;
+ const values = [];
+ for (const file of data.files) {
+ for (const b of file.benchmarks) {
+ if (!b.error && typeof b.variancePercentage === 'number' && b.variancePercentage > 0) {
+ values.push(b.variancePercentage);
+ }
+ }
+ }
+ if (values.length === 0) return null;
+ values.sort((a, b) => a - b);
+ const mid = Math.floor(values.length / 2);
+ return values.length % 2 ? values[mid] : (values[mid - 1] + values[mid]) / 2;
+}
+
+const newStats = () => ({ improved: 0, regressed: 0, unchanged: 0, new: 0, changes: [] });
+
+// options:
+// interpData, bytecodeData โ required PR-run reports (this run)
+// interpMain, bytecodeMain โ same-runner `main`-build reports (may be null)
+// profileDiff โ optional pre-rendered deterministic profile markdown
+// runnerLabel โ e.g. 'ubuntu-latest x64'
+// returns { body } โ markdown WITHOUT the HTML comment marker.
+function buildBenchmarkComparison(options) {
+ const interpData = options.interpData;
+ const bytecodeData = options.bytecodeData;
+ const interpMain = options.interpMain || null;
+ const bytecodeMain = options.bytecodeMain || null;
+ const runnerLabel = options.runnerLabel || 'ubuntu-latest x64';
+
+ const hasAnyMain = interpMain || bytecodeMain;
+
+ const interpMainMap = buildMap(interpMain);
+ const bytecodeMainMap = buildMap(bytecodeMain);
+ const bytecodeCurrentMap = buildMap(bytecodeData);
+
+ const totals = { interp: newStats(), bytecode: newStats() };
+ let totalBenchmarks = 0;
+ const fileBlocks = [];
+
+ for (const file of interpData.files) {
+ const fileName = file.fileName;
+ const iBase = interpMainMap[fileName] || {};
+ const bBase = bytecodeMainMap[fileName] || {};
+ const bCurrent = bytecodeCurrentMap[fileName] || {};
+ const fileS = { interp: newStats(), bytecode: newStats() };
+ let fileRows = '';
+ const benches = file.benchmarks.filter((b) => !b.error);
+ totalBenchmarks += benches.length;
+
+ if (hasAnyMain) {
+ fileRows += '| Benchmark | Interpreted (main โ PR) | ฮ | Bytecode (main โ PR) | ฮ |\n';
+ fileRows += '|-----------|------------|---|---------|---|\n';
+ } else {
+ fileRows += '| Benchmark | Interpreted | Bytecode |\n';
+ fileRows += '|-----------|------------|----------|\n';
+ }
+
+ for (const bench of benches) {
+ const key = `${bench.suite} > ${bench.name}`;
+ const iCurrent = normalizeBenchmark(bench);
+ const bCurrentEntry = bCurrent[key];
+ const iBaseEntry = iBase[key];
+ const bBaseEntry = bBase[key];
+
+ if (hasAnyMain) {
+ let iDelta;
+ if (iBaseEntry) {
+ const c = calculateChange(iBaseEntry, iCurrent);
+ const classification = classify(iBaseEntry, iCurrent);
+ if (c != null) fileS.interp.changes.push(c);
+ fileS.interp[classification]++;
+ iDelta = fmtDelta(classification, c);
+ } else {
+ iDelta = '๐';
+ fileS.interp.new++;
+ }
+
+ let bDelta;
+ if (bCurrentEntry && bBaseEntry) {
+ const c = calculateChange(bBaseEntry, bCurrentEntry);
+ const classification = classify(bBaseEntry, bCurrentEntry);
+ if (c != null) fileS.bytecode.changes.push(c);
+ fileS.bytecode[classification]++;
+ bDelta = fmtDelta(classification, c);
+ } else if (bCurrentEntry) {
+ bDelta = '๐';
+ fileS.bytecode.new++;
+ } else {
+ bDelta = 'โ';
+ }
+
+ fileRows += `| ${bench.name} | ${fmtCell(iBaseEntry, iCurrent)} | ${iDelta} | ${bCurrentEntry ? fmtCell(bBaseEntry, bCurrentEntry) : 'โ'} | ${bDelta} |\n`;
+ } else {
+ fileRows += `| ${bench.name} | ${fmtRange(iCurrent)} | ${bCurrentEntry ? fmtRange(bCurrentEntry) : 'โ'} |\n`;
+ }
+ }
+
+ for (const mode of ['interp', 'bytecode']) {
+ for (const k of ['improved', 'regressed', 'unchanged', 'new']) totals[mode][k] += fileS[mode][k];
+ totals[mode].changes.push(...fileS[mode].changes);
+ }
+
+ const hasSignificant = fileS.interp.improved + fileS.interp.regressed + fileS.bytecode.improved + fileS.bytecode.regressed > 0;
+
+ function modeSummary(label, s) {
+ const p = [];
+ if (s.improved > 0) p.push(`๐ข ${s.improved}`);
+ if (s.regressed > 0) p.push(`๐ด ${s.regressed}`);
+ if (s.new > 0) p.push(`๐ ${s.new}`);
+ if (s.unchanged > 0) p.push(`${s.unchanged} unch.`);
+ const avg = fmtAvg(s.changes);
+ return `${label}: ${p.length > 0 ? p.join(', ') : `${s.changes.length + s.new}`}${avg ? ` ยท ${avg}` : ''}`;
+ }
+
+ const fileSummary = hasAnyMain
+ ? modeSummary('Interp', fileS.interp) + ' ยท ' + modeSummary('Bytecode', fileS.bytecode)
+ : `${benches.length} benchmarks`;
+
+ fileBlocks.push({ fileName, fileSummary, fileRows, hasSignificant });
+ }
+
+ let body = '## Benchmark Results\n\n';
+
+ if (hasAnyMain) {
+ function overallMode(label, s) {
+ const p = [];
+ if (s.improved > 0) p.push(`๐ข ${s.improved} improved`);
+ if (s.regressed > 0) p.push(`๐ด ${s.regressed} regressed`);
+ if (s.new > 0) p.push(`๐ ${s.new} new`);
+ if (s.unchanged > 0) p.push(`${s.unchanged} unchanged`);
+ const avg = fmtAvg(s.changes);
+ return `**${label}:** ${p.join(' ยท ')}${avg ? ` ยท ${avg}` : ''}`;
+ }
+ body += `**${totalBenchmarks}** benchmarks ยท PR vs same-runner \`main\` build\n\n`;
+ body += overallMode('Interpreted', totals.interp) + '\\\n';
+ body += overallMode('Bytecode', totals.bytecode) + '\n\n';
+
+ const iNoise = medianVariance(interpData);
+ const bNoise = medianVariance(bytecodeData);
+ const noiseParts = [];
+ if (iNoise != null) noiseParts.push(`interpreted ยฑ${iNoise.toFixed(1)}%`);
+ if (bNoise != null) noiseParts.push(`bytecode ยฑ${bNoise.toFixed(1)}%`);
+ if (noiseParts.length > 0) {
+ body += `Typical per-run noise (median variance): ${noiseParts.join(', ')}. Deltas within noise overlap and read as unchanged.\n\n`;
+ }
+ } else {
+ body += `**${totalBenchmarks}** benchmarks (no same-runner \`main\` build to compare)\n\n`;
+ }
+
+ for (const block of fileBlocks) {
+ const shortName = block.fileName.replace(/^benchmarks\//, '');
+ const open = block.hasSignificant ? ' open' : '';
+ body += `\n`;
+ body += `${shortName} โ ${block.fileSummary}
\n\n`;
+ body += block.fileRows;
+ body += `\n \n\n`;
+ }
+
+ if (options.profileDiff) {
+ body += options.profileDiff + '\n\n';
+ }
+
+ body += hasAnyMain
+ ? `Measured on ${runnerLabel}. Each PR run also builds the \`main\` base and benchmarks it back-to-back on the same runner after a warm-up discard, so the ranges compare two runs measured under the same conditions; overlapping min/max ranges are treated as unchanged noise. Percentage deltas are secondary context. See docs/adr/0076.`
+ : `Measured on ${runnerLabel}. No \`main\` build was available to compare against, so these are raw PR-run numbers. See docs/adr/0076.`;
+
+ return { body };
+}
+
+module.exports = {
+ buildBenchmarkComparison,
+ classify,
+ calculateChange,
+ normalizeBenchmark,
+ buildMap,
+ medianVariance,
+};
diff --git a/scripts/profile-diff.js b/scripts/profile-diff.js
index 1baa8724..221f95f7 100755
--- a/scripts/profile-diff.js
+++ b/scripts/profile-diff.js
@@ -217,7 +217,7 @@ function buildProfileDiffSection(options) {
if (baselineFiles.length === 0) {
return {
- markdown: '## Deterministic profile diff\n\nDeterministic profile diff: no cached baseline was restored.',
+ markdown: '## Deterministic profile diff\n\nDeterministic profile diff: no `main`-build profile was available.',
signalCount: 0,
};
}
diff --git a/scripts/test-benchmark-compare.ts b/scripts/test-benchmark-compare.ts
new file mode 100644
index 00000000..3f5d839f
--- /dev/null
+++ b/scripts/test-benchmark-compare.ts
@@ -0,0 +1,167 @@
+#!/usr/bin/env bun
+/**
+ * test-benchmark-compare.ts
+ *
+ * Unit tests for scripts/benchmark-compare.js โ the PR "Benchmark Results"
+ * comment builder. Locks the range-overlap classifier and proves the issue
+ * #815 failure mode: when the PR and `main` ranges overlap (the realistic
+ * same-runner case), a benchmark untouched by the diff reads as `unchanged`,
+ * not improved/regressed โ even when its point-estimate delta is positive.
+ *
+ * Run: bun run scripts/test-benchmark-compare.ts
+ */
+
+import { createRequire } from "node:module";
+
+const require = createRequire(import.meta.url);
+const {
+ classify,
+ medianVariance,
+ buildBenchmarkComparison,
+} = require("./benchmark-compare.js");
+
+let passed = 0;
+
+function assert(condition: unknown, message: string): void {
+ if (!condition) throw new Error(`Assertion failed: ${message}`);
+ passed++;
+}
+
+function assertEqual(actual: T, expected: T, message: string): void {
+ if (actual !== expected) {
+ throw new Error(`${message}\n expected: ${JSON.stringify(expected)}\n actual: ${JSON.stringify(actual)}`);
+ }
+ passed++;
+}
+
+// A benchmark entry: opsPerSec point estimate plus the min/max range across rounds.
+const bench = (
+ suite: string,
+ name: string,
+ opsPerSec: number,
+ min: number,
+ max: number,
+ variancePercentage = 0,
+) => ({ suite, name, opsPerSec, minOpsPerSec: min, maxOpsPerSec: max, variancePercentage });
+
+const report = (fileName: string, benchmarks: object[]) => ({
+ files: [{ fileName, benchmarks }],
+ totalBenchmarks: benchmarks.length,
+});
+
+// -- classify(): pure range overlap -----------------------------------------------
+
+console.log("classify() range overlap...");
+{
+ // PR range entirely above main range โ improved.
+ assertEqual(
+ classify({ minOpsPerSec: 100, maxOpsPerSec: 110 }, { minOpsPerSec: 120, maxOpsPerSec: 130 }),
+ "improved",
+ "PR min above main max is improved",
+ );
+ // PR range entirely below main range โ regressed.
+ assertEqual(
+ classify({ minOpsPerSec: 120, maxOpsPerSec: 130 }, { minOpsPerSec: 100, maxOpsPerSec: 110 }),
+ "regressed",
+ "PR max below main min is regressed",
+ );
+ // Overlapping ranges โ unchanged noise.
+ assertEqual(
+ classify({ minOpsPerSec: 100, maxOpsPerSec: 130 }, { minOpsPerSec: 110, maxOpsPerSec: 140 }),
+ "unchanged",
+ "overlapping ranges are unchanged",
+ );
+ // No main entry โ new.
+ assertEqual(
+ classify(undefined, { minOpsPerSec: 110, maxOpsPerSec: 140 }),
+ "new",
+ "missing main entry is new",
+ );
+}
+
+// -- Issue #815: same-runner overlap must NOT misclassify an untouched benchmark ---
+
+console.log("issue #815: untouched benchmark reads as unchanged...");
+{
+ // Mirrors the real PR #805 evidence: a regex-only change reported arraybuffer
+ // (which uses no regex) as "improved" purely from cross-runner offset. With a
+ // same-runner `main` build the ranges overlap, so it reads as unchanged even
+ // though the point estimate ticked up +2.5%.
+ const main = { opsPerSec: 120000, minOpsPerSec: 111000, maxOpsPerSec: 127000 };
+ const pr = { opsPerSec: 123000, minOpsPerSec: 110000, maxOpsPerSec: 127052 };
+ assertEqual(classify(main, pr), "unchanged", "same-runner overlap is unchanged, not improved");
+}
+
+// -- medianVariance(): honest noise floor from measured data -----------------------
+
+console.log("medianVariance()...");
+{
+ const data = report("benchmarks/a.js", [
+ bench("a", "x", 100, 90, 110, 2),
+ bench("a", "y", 100, 90, 110, 6),
+ bench("a", "z", 100, 90, 110, 4),
+ bench("a", "err-skipped", 0, 0, 0, 0), // zero variance is ignored
+ ]);
+ assertEqual(medianVariance(data), 4, "median of {2,4,6} is 4");
+ assertEqual(medianVariance(null), null, "null data yields null noise");
+}
+
+// -- buildBenchmarkComparison(): with a same-runner main build ---------------------
+
+console.log("buildBenchmarkComparison() with main build...");
+{
+ const interpData = report("benchmarks/arraybuffer.js", [
+ bench("arraybuffer", "create", 123000, 110000, 127052, 5),
+ bench("arraybuffer", "brand new", 5000, 4900, 5100, 3),
+ ]);
+ const interpMain = report("benchmarks/arraybuffer.js", [
+ bench("arraybuffer", "create", 120000, 111000, 127000, 5),
+ // "brand new" intentionally absent from main โ should render as ๐.
+ ]);
+ const bytecodeData = report("benchmarks/arraybuffer.js", [
+ bench("arraybuffer", "create", 162000, 140000, 190000, 8),
+ ]);
+ const bytecodeMain = report("benchmarks/arraybuffer.js", [
+ bench("arraybuffer", "create", 160000, 152000, 164000, 8),
+ ]);
+
+ const { body } = buildBenchmarkComparison({
+ interpData,
+ interpMain,
+ bytecodeData,
+ bytecodeMain,
+ runnerLabel: "ubuntu-latest x64",
+ });
+
+ assert(body.includes("PR vs same-runner `main` build"), "header names the same-runner main build");
+ assert(body.includes("main โ PR"), "table columns show main โ PR direction");
+ assert(body.includes("Typical per-run noise"), "noise floor line is present");
+ assert(body.includes("docs/adr/0076"), "footnote references the ADR");
+ assert(body.includes("๐"), "benchmark missing from main renders as new");
+ // The untouched 'create' benchmark overlaps โ must read as unchanged overlap,
+ // never a green improvement.
+ assert(body.includes("~ overlap"), "overlapping benchmark renders as unchanged overlap");
+ assert(!body.includes("๐ข 1 improved"), "no spurious improvement from same-runner overlap");
+}
+
+// -- buildBenchmarkComparison(): no main build to compare --------------------------
+
+console.log("buildBenchmarkComparison() without main build...");
+{
+ const interpData = report("benchmarks/arraybuffer.js", [bench("arraybuffer", "create", 123000, 110000, 127052)]);
+ const bytecodeData = report("benchmarks/arraybuffer.js", [bench("arraybuffer", "create", 162000, 140000, 190000)]);
+
+ const { body } = buildBenchmarkComparison({
+ interpData,
+ interpMain: null,
+ bytecodeData,
+ bytecodeMain: null,
+ });
+
+ assert(body.includes("no same-runner `main` build to compare"), "states that no main build was available");
+ assert(!body.includes("ฮ"), "no delta column without a main build");
+ // The footnote must not claim a comparison that did not happen (issue #815 review).
+ assert(!body.includes("back-to-back"), "no-main footnote does not claim a back-to-back comparison");
+}
+
+console.log(`\nbenchmark-compare: ${passed} assertions passed.`);