From 1e89ce3432f97057517a27bf1311cffb15827ddc Mon Sep 17 00:00:00 2001
From: Johannes Stein <johannesstein@freeze-dev.com>
Date: Sat, 27 Jun 2026 09:45:27 +0100
Subject: [PATCH 1/4] ci(benchmark): compare PR against a same-runner main
 build

The PR "Benchmark Results" comment classified benchmarks by comparing the
PR run against a cached `main` baseline measured on a different runner at a
different time. Cross-runner variance dwarfs each run's own min/max spread,
so the range-overlap classifier flipped large numbers of unrelated
benchmarks to improved/regressed (e.g. #805, a regex-only change, reported
non-regex benchmarks as improved). The verdicts were not actionable.

Build the PR's base commit in a new `build-main` job and benchmark that
`main` build and the PR build back-to-back on the same runner, after a
warm-up discard of both binaries, so deltas reflect the diff rather than
infrastructure. The range-overlap classifier is unchanged; it now compares
two runs measured under the same conditions. The `main`-base run is
`continue-on-error` so a PR that adds a benchmark `main` cannot run is not
blocked (it degrades to new). The comparison stays advisory (comment-only).

- Extract the ~200-line inline comparison into scripts/benchmark-compare.js
  with unit tests (scripts/test-benchmark-compare.ts), wired into both cli
  jobs; report the measured median per-run variance as a noise floor.
- Regenerate the deterministic profile diff from the same `main` build and
  remove the dead cross-runner benchmark/profile baseline caches.
- Add ADR 0076 and update docs/build-system.md and docs/benchmarks.md.

Closes #815

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/ci.yml                      |  22 +-
 .github/workflows/pr.yml                      | 367 +++++++-----------
 .../0076-same-runner-benchmark-comparison.md  |  13 +
 docs/adr/README.md                            |   1 +
 docs/benchmarks.md                            |  18 +-
 docs/build-system.md                          |   4 +-
 scripts/benchmark-compare.js                  | 274 +++++++++++++
 scripts/profile-diff.js                       |   2 +-
 scripts/test-benchmark-compare.ts             | 167 ++++++++
 9 files changed, 612 insertions(+), 256 deletions(-)
 create mode 100644 docs/adr/0076-same-runner-benchmark-comparison.md
 create mode 100644 scripts/benchmark-compare.js
 create mode 100644 scripts/test-benchmark-compare.ts

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7f4311fe5..a2db4044f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -779,7 +779,10 @@ jobs:
           GOCCIA_BENCH_ROUNDS: 7
         run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress
 
-      - name: Run benchmarks and save baseline
+      # main/x86_64-linux emits JSON so the next step can validate the benchmark
+      # report shape. PR comparison no longer reads a cached baseline from here —
+      # each PR builds and benchmarks `main` on its own runner (issue #815).
+      - name: Run benchmarks (validate JSON output)
         if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux'
         env:
           GOCCIA_BENCH_CALIBRATION_MS: 100
@@ -818,13 +821,6 @@ jobs:
             sys.exit(1)
           PY
 
-      - name: Cache benchmark baseline
-        if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
-        with:
-          path: benchmark-${{ matrix.mode }}-results.json
-          key: benchmark-${{ matrix.mode }}-baseline-${{ github.sha }}
-
       - uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2
         if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode'
 
@@ -855,13 +851,6 @@ jobs:
             --summary-output benchmark-profile-aggregate.json \
             --markdown-output benchmark-profile-aggregate.md
 
-      - name: Cache deterministic profile baseline
-        if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
-        with:
-          path: profile-baseline/
-          key: profile-baseline-${{ github.sha }}
-
       - name: Upload benchmark profile
         if: github.ref == 'refs/heads/main' && matrix.target == 'x86_64-linux' && matrix.mode == 'bytecode' && hashFiles('benchmark-profile-aggregate.json') != '' && hashFiles('profile-baseline/**') != ''
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
@@ -949,6 +938,9 @@ jobs:
       - name: Check embedded resource CLI behaviour
         run: bun run scripts/test-cli-embedded-resources.ts
 
+      - name: Check benchmark comparison logic
+        run: bun run scripts/test-benchmark-compare.ts
+
       - name: Check Windows TLS dependencies
         if: runner.os == 'Windows'
         shell: pwsh
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index bf3814423..74e3f79f7 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -39,6 +39,35 @@ jobs:
           retention-days: 1
           path: pr-artifacts/*
 
+  # Builds the benchmark runner from the PR's base commit (the `main` the PR
+  # targets) so the `benchmark` job can measure it back-to-back with the PR build
+  # on the *same* runner. This is the trustworthy half of the same-runner A/B
+  # comparison that replaces the old cross-runner cached baseline (issue #815).
+  build-main:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
+        with:
+          ref: ${{ github.event.pull_request.base.sha }}
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Install FPC
+        run: sudo apt-get update && sudo apt-get install fpc -qq > /dev/null
+
+      - name: Build benchmark runner (main base)
+        run: ./build.pas benchmarkrunner --prod
+
+      - name: Upload main benchmark runner
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        with:
+          name: build-main
+          retention-days: 1
+          path: build/GocciaBenchmarkRunner
+
   docs:
     runs-on: ubuntu-latest
     steps:
@@ -164,7 +193,7 @@ jobs:
           fi
 
   benchmark:
-    needs: build
+    needs: [build, build-main]
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
@@ -185,8 +214,42 @@ jobs:
           name: build-pr
           path: build/
 
+      - name: Download main benchmark runner
+        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+        with:
+          name: build-main
+          path: build-main/
+
       - name: Make binaries executable
-        run: chmod +x build/*
+        run: |
+          chmod +x build/*
+          chmod +x build-main/*
+
+      # Same-runner A/B (issue #815): the PR and `main` builds are measured on
+      # this one runner so the delta reflects the diff, not cross-runner
+      # variance. Both benchmark the PR's `benchmarks/` workloads. The warm-up
+      # runs are discarded on purpose — they shed runner cold-start (page
+      # faults, sub-turbo clocks) for BOTH binaries so neither the `main` run
+      # (measured first) nor the PR run is penalised by being measured cold.
+      - name: Warm up benchmark runners
+        env:
+          GOCCIA_BENCH_ROUNDS: 1
+        run: |
+          ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true
+          ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true
+
+      # `main` is built from the PR's base commit, so it cannot run benchmarks the
+      # PR newly adds or changes to use new language features. The report is still
+      # written before a non-zero exit, so continue-on-error keeps such a PR from
+      # being blocked: a benchmark that errors on `main` simply has no baseline
+      # entry and renders as 🆕 new. The PR's own run below is intentionally NOT
+      # tolerant — a broken PR benchmark should fail the leg.
+      - name: Run benchmarks (main base)
+        continue-on-error: true
+        env:
+          GOCCIA_BENCH_CALIBRATION_MS: 100
+          GOCCIA_BENCH_ROUNDS: 7
+        run: ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}-main.json
 
       - name: Run benchmarks
         env:
@@ -194,6 +257,26 @@ jobs:
           GOCCIA_BENCH_ROUNDS: 7
         run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json
 
+      - name: Capture deterministic profiles (main base)
+        if: matrix.mode == 'bytecode'
+        continue-on-error: true
+        run: |
+          set -euo pipefail
+          mkdir -p profile-main
+          while IFS= read -r bench; do
+            rel="${bench#benchmarks/}"
+            out="profile-main/${rel%.*}.json"
+            mkdir -p "$(dirname "$out")"
+            ./build-main/GocciaBenchmarkRunner "$bench" \
+              --mode=bytecode \
+              --profile-deterministic \
+              --profile=all \
+              --profile-output="$out" \
+              --no-progress \
+              --format=compact-json \
+              > /dev/null
+          done < <(find benchmarks -type f \( -name '*.js' -o -name '*.jsx' -o -name '*.ts' -o -name '*.tsx' -o -name '*.mjs' \) ! -path '*/helpers/*' | sort)
+
       - name: Capture deterministic profiles
         if: matrix.mode == 'bytecode'
         continue-on-error: true
@@ -227,6 +310,13 @@ jobs:
           retention-days: 1
           path: benchmark-${{ matrix.mode }}.json
 
+      - name: Upload main results
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        with:
+          name: benchmark-${{ matrix.mode }}-main
+          retention-days: 1
+          path: benchmark-${{ matrix.mode }}-main.json
+
       - name: Upload deterministic profiles
         if: matrix.mode == 'bytecode'
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
@@ -235,6 +325,14 @@ jobs:
           retention-days: 1
           path: profile-current/
 
+      - name: Upload main deterministic profiles
+        if: matrix.mode == 'bytecode'
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        with:
+          name: profile-main
+          retention-days: 1
+          path: profile-main/
+
   cli:
     needs: build
     runs-on: ubuntu-latest
@@ -276,6 +374,9 @@ jobs:
       - name: Check embedded resource CLI behaviour
         run: bun run scripts/test-cli-embedded-resources.ts
 
+      - name: Check benchmark comparison logic
+        run: bun run scripts/test-benchmark-compare.ts
+
   test262:
     needs: build
     runs-on: ubuntu-latest
@@ -381,259 +482,65 @@ jobs:
           name: profile-current
           path: profile-current/
 
-      - name: Restore interpreted baseline
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+      - name: Download interpreted main results
+        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+        continue-on-error: true
         with:
-          path: benchmark-interpreted-results.json
-          key: benchmark-interpreted-baseline-will-not-match
-          restore-keys: benchmark-interpreted-baseline-
+          name: benchmark-interpreted-main
 
-      - name: Restore bytecode baseline
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+      - name: Download bytecode main results
+        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+        continue-on-error: true
         with:
-          path: benchmark-bytecode-results.json
-          key: benchmark-bytecode-baseline-will-not-match
-          restore-keys: benchmark-bytecode-baseline-
+          name: benchmark-bytecode-main
 
-      - name: Restore deterministic profile baseline
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+      - name: Download main deterministic profiles
+        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+        continue-on-error: true
         with:
-          path: profile-baseline/
-          key: profile-baseline-will-not-match
-          restore-keys: profile-baseline-
-
-      - name: Check baselines
-        run: |
-          if [ -f benchmark-interpreted-results.json ]; then
-            echo "BASELINE_EXISTS=true" >> "$GITHUB_ENV"
-          else
-            echo "BASELINE_EXISTS=false" >> "$GITHUB_ENV"
-          fi
-          if [ -f benchmark-bytecode-results.json ]; then
-            echo "BYTECODE_BASELINE_EXISTS=true" >> "$GITHUB_ENV"
-          else
-            echo "BYTECODE_BASELINE_EXISTS=false" >> "$GITHUB_ENV"
-          fi
+          name: profile-main
+          path: profile-main/
 
       - name: Compare benchmarks and comment
         uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
         with:
           script: |
             const fs = require('fs');
+            const { buildBenchmarkComparison } = require('./scripts/benchmark-compare.js');
             const { buildProfileDiffSection } = require('./scripts/profile-diff.js');
 
-            const fmtOps = (n) => Math.round(n).toString().replace(/\B(?=(\d{3})+(?!\d))/g, ',');
-            const fmtPercent = (change) => {
-              if (change == null) return 'N/A';
-              return `${change >= 0 ? '+' : ''}${change.toFixed(1)}%`;
-            };
-            const fmtAvg = (changes) => {
-              if (changes.length === 0) return '';
-              const avg = changes.reduce((a, b) => a + b, 0) / changes.length;
-              return `avg ${fmtPercent(avg)}`;
-            };
-            const fmtDelta = (classification, change) => {
-              if (change == null) return classification === 'new' ? '🆕' : 'N/A';
-              if (classification === 'improved') return `🟢 ${fmtPercent(change)}`;
-              if (classification === 'regressed') return `🔴 ${fmtPercent(change)}`;
-              if (classification === 'unchanged') return `~ overlap (${fmtPercent(change)})`;
-              return '🆕';
-            };
-            const fmtRange = (entry) => {
-              if (!entry) return '—';
-              if (entry.minOpsPerSec === entry.maxOpsPerSec) return fmtOps(entry.opsPerSec);
-              return `${fmtOps(entry.minOpsPerSec)}..${fmtOps(entry.maxOpsPerSec)}`;
+            const read = (p) => JSON.parse(fs.readFileSync(p, 'utf8'));
+            const tryRead = (p) => {
+              try { return read(p); }
+              catch (e) { console.log(`No same-runner main result at ${p}:`, e.message); return null; }
             };
-            const fmtBenchmark = (entry) => {
-              if (!entry) return '—';
-              if (entry.minOpsPerSec === entry.maxOpsPerSec) return `${fmtOps(entry.opsPerSec)} ops/sec`;
-              return `${fmtOps(entry.opsPerSec)} ops/sec [${fmtRange(entry)}]`;
-            };
-            const fmtCell = (baseEntry, currentEntry) => {
-              if (baseEntry) return `${fmtBenchmark(baseEntry)} → ${fmtBenchmark(currentEntry)}`;
-              return fmtBenchmark(currentEntry);
-            };
-            const normalizeBenchmark = (benchmark) => ({
-              opsPerSec: benchmark.opsPerSec,
-              minOpsPerSec: benchmark.minOpsPerSec ?? benchmark.opsPerSec,
-              maxOpsPerSec: benchmark.maxOpsPerSec ?? benchmark.opsPerSec,
-              variancePercentage: benchmark.variancePercentage ?? 0,
-            });
-            const calculateChange = (baseEntry, currentEntry) => {
-              if (!baseEntry || !baseEntry.opsPerSec) return null;
-              return ((currentEntry.opsPerSec - baseEntry.opsPerSec) / baseEntry.opsPerSec) * 100;
-            };
-            const classify = (baseEntry, currentEntry) => {
-              if (!baseEntry) return 'new';
-              if (currentEntry.minOpsPerSec > baseEntry.maxOpsPerSec) return 'improved';
-              if (currentEntry.maxOpsPerSec < baseEntry.minOpsPerSec) return 'regressed';
-              return 'unchanged';
-            };
-
-            const interpData = JSON.parse(fs.readFileSync('benchmark-interpreted.json', 'utf8'));
-            const bytecodeData = JSON.parse(fs.readFileSync('benchmark-bytecode.json', 'utf8'));
-
-            let interpBaseline = null;
-            if (process.env.BASELINE_EXISTS === 'true') {
-              try { interpBaseline = JSON.parse(fs.readFileSync('benchmark-interpreted-results.json', 'utf8')); }
-              catch (e) { console.log('Failed to parse interpreted baseline:', e.message); }
-            }
-            let bytecodeBaseline = null;
-            if (process.env.BYTECODE_BASELINE_EXISTS === 'true') {
-              try { bytecodeBaseline = JSON.parse(fs.readFileSync('benchmark-bytecode-results.json', 'utf8')); }
-              catch (e) { console.log('Failed to parse bytecode baseline:', e.message); }
-            }
-            const hasAnyBaseline = interpBaseline || bytecodeBaseline;
-
-            function buildMap(data) {
-              const map = {};
-              if (!data) return map;
-              for (const file of data.files) {
-                const m = {};
-                for (const b of file.benchmarks) {
-                  if (!b.error) m[`${b.suite} > ${b.name}`] = normalizeBenchmark(b);
-                }
-                map[file.fileName] = m;
-              }
-              return map;
-            }
-
-            const interpBaseMap = buildMap(interpBaseline);
-            const bytecodeBaseMap = buildMap(bytecodeBaseline);
-            const bytecodeCurrentMap = buildMap(bytecodeData);
-
-            const newStats = () => ({ improved: 0, regressed: 0, unchanged: 0, new: 0, changes: [] });
-            const totals = { interp: newStats(), bytecode: newStats() };
-            let totalBenchmarks = 0;
-            const fileBlocks = [];
-
-            for (const file of interpData.files) {
-              const fileName = file.fileName;
-              const iBase = interpBaseMap[fileName] || {};
-              const bBase = bytecodeBaseMap[fileName] || {};
-              const bCurrent = bytecodeCurrentMap[fileName] || {};
-              const fileS = { interp: newStats(), bytecode: newStats() };
-              let fileRows = '';
-              const benches = file.benchmarks.filter(b => !b.error);
-              totalBenchmarks += benches.length;
-
-              if (hasAnyBaseline) {
-                fileRows += '| Benchmark | Interpreted | Δ | Bytecode | Δ |\n';
-                fileRows += '|-----------|------------|---|---------|---|\n';
-              } else {
-                fileRows += '| Benchmark | Interpreted | Bytecode |\n';
-                fileRows += '|-----------|------------|----------|\n';
-              }
-
-              for (const bench of benches) {
-                const key = `${bench.suite} > ${bench.name}`;
-                const iCurrent = normalizeBenchmark(bench);
-                const bCurrentEntry = bCurrent[key];
-                const iBaseEntry = iBase[key];
-                const bBaseEntry = bBase[key];
-
-                if (hasAnyBaseline) {
-                  let iDelta;
-                  if (iBaseEntry) {
-                    const c = calculateChange(iBaseEntry, iCurrent);
-                    const classification = classify(iBaseEntry, iCurrent);
-                    if (c != null) fileS.interp.changes.push(c);
-                    fileS.interp[classification]++;
-                    iDelta = fmtDelta(classification, c);
-                  } else {
-                    iDelta = '🆕';
-                    fileS.interp.new++;
-                  }
-
-                  let bDelta;
-                  if (bCurrentEntry && bBaseEntry) {
-                    const c = calculateChange(bBaseEntry, bCurrentEntry);
-                    const classification = classify(bBaseEntry, bCurrentEntry);
-                    if (c != null) fileS.bytecode.changes.push(c);
-                    fileS.bytecode[classification]++;
-                    bDelta = fmtDelta(classification, c);
-                  } else if (bCurrentEntry) {
-                    bDelta = '🆕';
-                    fileS.bytecode.new++;
-                  } else {
-                    bDelta = '—';
-                  }
-
-                  fileRows += `| ${bench.name} | ${fmtCell(iBaseEntry, iCurrent)} | ${iDelta} | ${bCurrentEntry ? fmtCell(bBaseEntry, bCurrentEntry) : '—'} | ${bDelta} |\n`;
-                } else {
-                  fileRows += `| ${bench.name} | ${fmtRange(iCurrent)} | ${bCurrentEntry ? fmtRange(bCurrentEntry) : '—'} |\n`;
-                }
-              }
-
-              for (const mode of ['interp', 'bytecode']) {
-                for (const k of ['improved', 'regressed', 'unchanged', 'new']) totals[mode][k] += fileS[mode][k];
-                totals[mode].changes.push(...fileS[mode].changes);
-              }
-
-              const hasSignificant = fileS.interp.improved + fileS.interp.regressed + fileS.bytecode.improved + fileS.bytecode.regressed > 0;
-
-              function modeSummary(label, s) {
-                const p = [];
-                if (s.improved > 0) p.push(`🟢 ${s.improved}`);
-                if (s.regressed > 0) p.push(`🔴 ${s.regressed}`);
-                if (s.new > 0) p.push(`🆕 ${s.new}`);
-                if (s.unchanged > 0) p.push(`${s.unchanged} unch.`);
-                const avg = fmtAvg(s.changes);
-                return `${label}: ${p.length > 0 ? p.join(', ') : `${s.changes.length + s.new}`}${avg ? ` · ${avg}` : ''}`;
-              }
-
-              const fileSummary = hasAnyBaseline
-                ? modeSummary('Interp', fileS.interp) + ' · ' + modeSummary('Bytecode', fileS.bytecode)
-                : `${benches.length} benchmarks`;
-
-              fileBlocks.push({ fileName, fileSummary, fileRows, hasSignificant });
-            }
-
-            let body = '## Benchmark Results\n\n';
-
-            if (hasAnyBaseline) {
-              function overallMode(label, s) {
-                const p = [];
-                if (s.improved > 0) p.push(`🟢 ${s.improved} improved`);
-                if (s.regressed > 0) p.push(`🔴 ${s.regressed} regressed`);
-                if (s.new > 0) p.push(`🆕 ${s.new} new`);
-                if (s.unchanged > 0) p.push(`${s.unchanged} unchanged`);
-                const avg = fmtAvg(s.changes);
-                return `**${label}:** ${p.join(' · ')}${avg ? ` · ${avg}` : ''}`;
-              }
-              body += `**${totalBenchmarks}** benchmarks\n\n`;
-              body += overallMode('Interpreted', totals.interp) + '\\\n';
-              body += overallMode('Bytecode', totals.bytecode) + '\n\n';
-            } else {
-              body += `**${totalBenchmarks}** benchmarks (no baseline)\n\n`;
-            }
-
-            for (const block of fileBlocks) {
-              const shortName = block.fileName.replace(/^benchmarks\//, '');
-              const open = block.hasSignificant ? ' open' : '';
-              body += `<details${open}>\n`;
-              body += `<summary><strong>${shortName}</strong> — ${block.fileSummary}</summary>\n\n`;
-              body += block.fileRows;
-              body += `\n</details>\n\n`;
-            }
 
+            // Deterministic profile diff (opcode/allocation counts) is runner
+            // independent, so it compares the same-runner main build's profiles.
             const skipProfile = /\[skip-profile-check\]/.test(context.payload.pull_request?.body || '');
+            let profileDiff = '';
             try {
-              const profileDiff = buildProfileDiffSection({
-                baselineDir: 'profile-baseline',
+              profileDiff = buildProfileDiffSection({
+                baselineDir: 'profile-main',
                 currentDir: 'profile-current',
                 skip: skipProfile,
-              });
-              body += profileDiff.markdown + '\n\n';
+              }).markdown;
             } catch (error) {
               console.log('Failed to build deterministic profile diff:', error.message);
-              body += '## Deterministic profile diff\n\nDeterministic profile diff: unavailable due to a comparison error.\n\n';
+              profileDiff = '## Deterministic profile diff\n\nDeterministic profile diff: unavailable due to a comparison error.';
             }
 
-            body += '<sub>Measured on ubuntu-latest x64. Benchmark ranges compare cached main-branch min/max ops/sec with the PR run; overlapping ranges are treated as unchanged noise. Percentage deltas are secondary context.</sub>';
+            const { body: inner } = buildBenchmarkComparison({
+              interpData: read('benchmark-interpreted.json'),
+              interpMain: tryRead('benchmark-interpreted-main.json'),
+              bytecodeData: read('benchmark-bytecode.json'),
+              bytecodeMain: tryRead('benchmark-bytecode-main.json'),
+              profileDiff,
+              runnerLabel: 'ubuntu-latest x64',
+            });
 
             const marker = '<!-- benchmark-comparison -->';
-            body = marker + '\n' + body;
+            const body = marker + '\n' + inner;
 
             const { data: comments } = await github.rest.issues.listComments({
               owner: context.repo.owner,
diff --git a/docs/adr/0076-same-runner-benchmark-comparison.md b/docs/adr/0076-same-runner-benchmark-comparison.md
new file mode 100644
index 000000000..c02f695bf
--- /dev/null
+++ b/docs/adr/0076-same-runner-benchmark-comparison.md
@@ -0,0 +1,13 @@
+# Same-runner benchmark comparison
+
+**Date:** 2026-06-27
+**Area:** `ci` / `tooling`
+**Issue:** [#815](https://github.com/frostney/GocciaScript/issues/815)
+
+The PR "Benchmark Results" comment previously classified benchmarks as 🟢 improved / 🔴 regressed by comparing the PR's freshly-measured run against a **cached baseline measured in a different job, on a different GitHub-hosted runner, at a different time** (whatever `main` last pushed). The baseline was written by `ci.yml`'s `benchmark` job via `actions/cache/save` and restored in `pr.yml`'s `benchmark-comment` job by `restore-keys` prefix. Cross-runner/cross-time variance is systematically larger than each run's own min/max spread, so the range-overlap classifier (improved iff PR min > baseline max) flipped large numbers of benchmarks to improved/regressed even when the diff could not possibly affect them — e.g. PR [#805](https://github.com/frostney/GocciaScript/pull/805) (regex-decode only) reported `arraybuffer.js` and `arrays.js`, which use no regex, as broadly improved. The verdicts were confident but not actionable, and a real regression was indistinguishable from runner noise.
+
+We now build the PR's **base commit** (`github.event.pull_request.base.sha` — the `main` the PR targets) in a dedicated `build-main` job, and the `benchmark` job benchmarks that `main` build and the PR build **back-to-back on the same runner**, after a discarded warm-up run that sheds runner cold-start (page faults, sub-turbo clocks). The range-overlap classifier is unchanged; it now compares two runs measured under the same conditions, so the systematic between-runner offset cancels and only within-run noise remains — which is exactly what range overlap is designed to absorb. The deterministic profile diff (opcode/allocation counts) is regenerated from the same `main` build instead of a cached profile, making the whole comparison self-contained; the cross-runner `benchmark-*-baseline-*` and `profile-baseline-*` caches in `ci.yml` and their restores in `pr.yml` are removed. The ~200-line comparison previously inlined in workflow YAML moved to [`scripts/benchmark-compare.js`](../../scripts/benchmark-compare.js) with unit tests in `scripts/test-benchmark-compare.ts`, so the classifier semantics — including the #815 false-positive case — are locked by a test rather than embedded in `pr.yml`.
+
+The explicit trade-off is cost: building and benchmarking `main` inside every PR run roughly **doubles the benchmark job's CI time** (one extra `--prod` build of the benchmark runner plus a second benchmark pass per mode). That is the accepted price of a trustworthy comparison. The comparison stays **advisory (comment-only)**, not a merge-blocking gate: even same-runner measurement has a noise floor (re-running the identical `--prod` binary showed ±11–15%, up to ±59%), `VISION.md` (line 43) deprioritizes raw throughput, and test262 remains the gating regression check. The range-overlap rule absorbs that floor as "unchanged overlap," and the comment now reports the measured median per-run variance so readers can judge whether a sub-noise delta is meaningful.
+
+Alternatives considered and rejected: **normalizing the cached cross-runner baseline** to an in-run calibration reference — cheaper, but a single calibration benchmark cannot cancel per-workload-class variance (memory-bound vs CPU-bound runners scale differently), so false verdicts shrink rather than disappear; a **consistent self-hosted runner** — removes cross-runner variance structurally but adds infrastructure burden and runs untrusted PR code on owned hardware in a public repo; and **demoting timing entirely** in favour of the already-trustworthy deterministic profile diff — honest and cheap, but the deterministic diff only catches regressions that change instruction or allocation counts, not a pure wall-clock-per-op slowdown, which is part of what the gate exists to surface. Per-file interleaving of the two binaries (the issue's "ideally interleaved") is a possible future refinement if residual same-runner drift proves material; the reported noise floor makes such drift visible. See [docs/build-system.md](../build-system.md) `benchmark` job reference and [VISION.md](../../VISION.md).
diff --git a/docs/adr/README.md b/docs/adr/README.md
index 820ac306d..1be3f122d 100644
--- a/docs/adr/README.md
+++ b/docs/adr/README.md
@@ -85,3 +85,4 @@ Durable architecture and implementation decisions for GocciaScript. New ADRs use
 - [0073 — Opt-in ShadowRealm](0073-opt-in-shadowrealm.md)
 - [0074 — Deferred bytecode call-stack frames](0074-deferred-bytecode-call-stack-frames.md)
 - [0075 — ShadowRealm full test262 conformance](0075-shadowrealm-conformance.md)
+- [0076 — Same-runner benchmark comparison](0076-same-runner-benchmark-comparison.md)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 46c9e2073..39037c536 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -260,10 +260,10 @@ When a benchmark has a `setup` or `teardown` function, a second line displays th
 Benchmarks run as part of the CI pipeline in both **interpreter mode** and
 **bytecode mode**. CI uses `GOCCIA_BENCH_CALIBRATION_MS=100` and
 `GOCCIA_BENCH_ROUNDS=7` for stable measurements with IQR outlier filtering. On
-pushes to `main`, the ubuntu-latest x64 runner saves JSON baselines for each
-mode (`benchmark-interpreted-results.json` and `benchmark-bytecode-results.json`)
-to the GitHub Actions cache. See [testing.md](testing.md#ci-integration) for the
-full pipeline overview.
+pushes to `main`, the ubuntu-latest x64 runner emits JSON and validates the
+report shape. PR comparison builds and benchmarks `main` on the PR's own runner
+rather than reading a cached baseline ([ADR 0076](adr/0076-same-runner-benchmark-comparison.md)).
+See [testing.md](testing.md#ci-integration) for the full pipeline overview.
 
 Main bytecode benchmark runs also retain deterministic VM profile reports. The
 GitHub Actions artifact is named `benchmark-profile` and contains:
@@ -284,13 +284,15 @@ paths are `benchmark-profiles/runs/<artifactId>/aggregate.json.gz`,
 
 ### PR Benchmark Comparison
 
-The PR workflow (`.github/workflows/pr.yml`) restores the cached baseline JSON from `main`, runs the benchmark matrix, and posts a collapsible comparison comment on the PR. Each benchmark file gets a **unified table** with both execution modes side by side:
+The PR workflow (`.github/workflows/pr.yml`) builds the PR's base commit (`main`) in a `build-main` job and benchmarks that `main` build and the PR build **back-to-back on the same runner** (after a discarded warm-up), so deltas reflect the diff rather than cross-runner variance (issue #815, [ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). The comparison logic lives in [`scripts/benchmark-compare.js`](../scripts/benchmark-compare.js) (unit-tested by `scripts/test-benchmark-compare.ts`). It posts a collapsible comparison comment on the PR; each benchmark file gets a **unified table** with both execution modes side by side:
 
-- Each table row shows `| Benchmark | Interpreted | Δ | Bytecode | Δ |` with the point estimate and cached/PR min-max range in the form `10,000 ops/sec [9,500..10,500] → 9,200 ops/sec [8,700..9,700]`
-- Classification uses **range overlap** instead of a single point value: if the PR run sits fully above the baseline range it is improved, if it sits fully below it is regressed, and overlapping ranges are treated as unchanged noise
+- Each table row shows `| Benchmark | Interpreted (main → PR) | Δ | Bytecode (main → PR) | Δ |` with the point estimate and same-runner `main`/PR min-max range in the form `10,000 ops/sec [9,500..10,500] → 9,200 ops/sec [8,700..9,700]`
+- Classification uses **range overlap** instead of a single point value: if the PR run sits fully above the `main` range it is improved, if it sits fully below it is regressed, and overlapping ranges are treated as unchanged noise
 - Percentage deltas remain in the `Δ` column as secondary context, even when the classifier marks a benchmark as `~ overlap`
+- The overall summary reports the measured **median per-run variance** as a noise floor, so sub-noise deltas are read as unchanged
 - Results are **grouped by file**, each in a collapsible `<details>` section
 - Files with significant changes (improvements or regressions) are auto-expanded
 - Each file summary shows per-mode counts (e.g., `Interp: 🟢 1, 7 unch. · Bytecode: 🟢 2, 6 unch.`)
 - The **overall PR summary** shows per-mode totals on separate lines with average percentage deltas
-- 🟢 marks non-overlapping improvements, 🔴 marks non-overlapping regressions, `~ overlap` marks overlapping ranges, and 🆕 marks new benchmarks with no baseline
+- The comparison is **advisory** (comment-only, never merge-blocking)
+- 🟢 marks non-overlapping improvements, 🔴 marks non-overlapping regressions, `~ overlap` marks overlapping ranges, and 🆕 marks new benchmarks absent from the `main` build
diff --git a/docs/build-system.md b/docs/build-system.md
index 1425460c8..99481b4a3 100644
--- a/docs/build-system.md
+++ b/docs/build-system.md
@@ -669,7 +669,7 @@ Runs on the full platform matrix:
 
 **`test262`** (needs build, ubuntu-latest x64 only, **non-blocking**) — Downloads the `gocciascript-x86_64-linux` build, checks out [`tc39/test262`](https://github.com/tc39/test262) at the SHA pinned in `scripts/test262-suite-sha.txt`, runs `bun scripts/run_test262_suite.ts --suite-dir test262-suite --mode=bytecode --jobs=4 --timeout-ms=20000 --output=test262-results.json`, and uploads the report as a 30-day workflow artifact. The run step uses `continue-on-error: true` because the conformance lane is allowed to carry known steady-state failures while the engine closes compatibility gaps. On main, the JSON is also stashed via `actions/cache/save` under `test262-baseline-<sha>` so the PR workflow can compute Δ vs main, and `cd website && bun run publish-test262 ../test262-results.json` publishes the compressed run report plus its UTC daily dashboard pointer to Vercel Blob when `BLOB_READ_WRITE_TOKEN` is configured. Main runs also upload the `test262-profile` artifact and publish matching aggregate/detail profile payloads under the separate `test262-profiles/` Blob namespace for weekly performance review. The one-off `cd website && bun run backfill-test262` command seeds retained artifact reports and reruns expired historical days directly into Blob. The pin is bumped weekly by `.github/workflows/test262-bump.yml`. See [docs/test262.md](test262.md) for the harness and profile report contracts.
 
-**`benchmark`** (needs build) — Runs all benchmarks on all platforms. On main (ubuntu-latest x64), saves benchmark results as JSON to `actions/cache` for PR comparison. The main bytecode benchmark lane also captures deterministic VM profile details, uploads the `benchmark-profile` artifact, and publishes aggregate/detail profile payloads under the separate `benchmark-profiles/` Blob namespace when `BLOB_READ_WRITE_TOKEN` is configured.
+**`benchmark`** (needs build) — Runs all benchmarks on all platforms. On main (ubuntu-latest x64), it additionally emits JSON and validates the report shape. PR comparison no longer reads a cached baseline from here — each PR builds and benchmarks `main` on its own runner ([ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). The main bytecode benchmark lane also captures deterministic VM profile details, uploads the `benchmark-profile` artifact, and publishes aggregate/detail profile payloads under the separate `benchmark-profiles/` Blob namespace when `BLOB_READ_WRITE_TOKEN` is configured.
 
 **`cli`** (needs build) — Downloads pre-built binaries and runs CLI behavior smoke tests on all platforms via Bun: options across all apps, lexer numeric-separator rejection, parser error display, config-file loading, and app-specific features including Sandbox Runner seed baselines. Windows runs additionally assert that the loader binary does not link OpenSSL DLLs (HTTPS must use the platform TLS stack statically).
 
@@ -692,7 +692,7 @@ Runs on **ubuntu-latest x64 only** (single runner, no matrix).
 
 **`test`** (needs build) — Runs all JavaScript tests and Pascal unit tests.
 
-**`benchmark`** (needs build) — Restores the cached benchmark baseline JSON from main, runs all benchmarks with JSON output, and posts a collapsible comparison comment on the PR grouped by file. Each file section shows the cached baseline and PR `opsPerSec` point estimates side by side, with each point estimate carrying its min/max ops/sec range in brackets. Classification uses range overlap: fully above the baseline range is an improvement, fully below is a regression, and overlapping ranges are treated as unchanged noise. Percentage deltas are still shown as secondary context, and files with significant changes are auto-expanded. If no baseline exists, shows results without comparison.
+**`benchmark`** (needs build + build-main) — Compares the PR against a `main` build measured **on the same runner**. A separate `build-main` job builds the benchmark runner from the PR's base commit (`github.event.pull_request.base.sha`); the `benchmark` job then warms up the runner (discarded) and benchmarks the `main` build and the PR build back-to-back, so deltas reflect the diff rather than cross-runner variance (issue #815, [ADR 0076](adr/0076-same-runner-benchmark-comparison.md)). It posts a collapsible comparison comment grouped by file: each file section shows the same-runner `main` and PR `opsPerSec` point estimates side by side, each carrying its min/max ops/sec range in brackets. Classification (in [`scripts/benchmark-compare.js`](../scripts/benchmark-compare.js), unit-tested by `scripts/test-benchmark-compare.ts`) uses range overlap: fully above the `main` range is an improvement, fully below is a regression, and overlapping ranges are treated as unchanged noise. The comment also reports the measured median per-run variance as a noise floor. Percentage deltas are secondary context, files with significant changes are auto-expanded, and the comparison is advisory (comment-only, never merge-blocking). The deterministic profile diff (opcode/allocation counts) is regenerated from the same `main` build. If no `main` build is available, shows results without comparison.
 
 **`test262`** (needs build, **non-blocking**) — Checks out `tc39/test262` at the pinned SHA, runs `bun scripts/run_test262_suite.ts --suite-dir test262-suite --mode=bytecode --jobs=4 --timeout-ms=20000 --output=test262-results.json`, and uploads the JSON report. Failing tests do not fail the job. The downstream `test262-comment` job (`if: always()`) restores the most recent `test262-baseline-` cache entry from main, then `bun scripts/run_test262_suite.ts --comment test262-results.json <baseline>` builds the markdown body and the workflow posts/updates a comment using marker `<!-- test262-results -->`. The comment shows a per-category breakdown (built-ins, harness, intl402, language, staging) with Δ-vs-main columns when a baseline is cached, an "Areas closest to 100%" sub-table, and a collapsible per-test delta list. PR CI does not generate or upload the full-corpus test262 profile artifact.
 
diff --git a/scripts/benchmark-compare.js b/scripts/benchmark-compare.js
new file mode 100644
index 000000000..1f38cc79d
--- /dev/null
+++ b/scripts/benchmark-compare.js
@@ -0,0 +1,274 @@
+#!/usr/bin/env node
+
+// Builds the PR "Benchmark Results" comment by comparing the PR build against a
+// `main` build measured **on the same runner, back-to-back** in the same job.
+//
+// Why this module exists: the comparison used to read a `main` baseline from the
+// Actions cache, written by a *different* job on a *different* GitHub-hosted
+// runner at a *different* time. Cross-runner/cross-time variance dwarfs each
+// run's own min/max spread, so the range-overlap classifier flipped large
+// numbers of benchmarks to improved/regressed even when the diff could not have
+// affected them (issue #815). Feeding the classifier two same-runner runs makes
+// the systematic offset cancel; the range-overlap rule then only sees within-run
+// noise, which it is designed for. See docs/adr/0076.
+//
+// The classifier itself is unchanged from the original inline implementation
+// (range overlap on each run's own min/max). This module exists so that logic
+// is a single tested unit instead of ~200 lines embedded in workflow YAML.
+
+const fmtOps = (n) => Math.round(n).toString().replace(/\B(?=(\d{3})+(?!\d))/g, ',');
+
+const fmtPercent = (change) => {
+  if (change == null) return 'N/A';
+  return `${change >= 0 ? '+' : ''}${change.toFixed(1)}%`;
+};
+
+const fmtAvg = (changes) => {
+  if (changes.length === 0) return '';
+  const avg = changes.reduce((a, b) => a + b, 0) / changes.length;
+  return `avg ${fmtPercent(avg)}`;
+};
+
+const fmtDelta = (classification, change) => {
+  if (change == null) return classification === 'new' ? '🆕' : 'N/A';
+  if (classification === 'improved') return `🟢 ${fmtPercent(change)}`;
+  if (classification === 'regressed') return `🔴 ${fmtPercent(change)}`;
+  if (classification === 'unchanged') return `~ overlap (${fmtPercent(change)})`;
+  return '🆕';
+};
+
+const fmtRange = (entry) => {
+  if (!entry) return '—';
+  if (entry.minOpsPerSec === entry.maxOpsPerSec) return fmtOps(entry.opsPerSec);
+  return `${fmtOps(entry.minOpsPerSec)}..${fmtOps(entry.maxOpsPerSec)}`;
+};
+
+const fmtBenchmark = (entry) => {
+  if (!entry) return '—';
+  if (entry.minOpsPerSec === entry.maxOpsPerSec) return `${fmtOps(entry.opsPerSec)} ops/sec`;
+  return `${fmtOps(entry.opsPerSec)} ops/sec [${fmtRange(entry)}]`;
+};
+
+const fmtCell = (baseEntry, currentEntry) => {
+  if (baseEntry) return `${fmtBenchmark(baseEntry)} → ${fmtBenchmark(currentEntry)}`;
+  return fmtBenchmark(currentEntry);
+};
+
+const normalizeBenchmark = (benchmark) => ({
+  opsPerSec: benchmark.opsPerSec,
+  minOpsPerSec: benchmark.minOpsPerSec ?? benchmark.opsPerSec,
+  maxOpsPerSec: benchmark.maxOpsPerSec ?? benchmark.opsPerSec,
+  variancePercentage: benchmark.variancePercentage ?? 0,
+});
+
+const calculateChange = (baseEntry, currentEntry) => {
+  if (!baseEntry || !baseEntry.opsPerSec) return null;
+  return ((currentEntry.opsPerSec - baseEntry.opsPerSec) / baseEntry.opsPerSec) * 100;
+};
+
+// Range overlap on each run's own min/max range. Because both ranges now come
+// from the same runner, the only thing separating them is within-run noise: a
+// classification only fires when the PR range sits *entirely* above (improved)
+// or *entirely* below (regressed) the main range. Overlap → unchanged noise.
+const classify = (baseEntry, currentEntry) => {
+  if (!baseEntry) return 'new';
+  if (currentEntry.minOpsPerSec > baseEntry.maxOpsPerSec) return 'improved';
+  if (currentEntry.maxOpsPerSec < baseEntry.minOpsPerSec) return 'regressed';
+  return 'unchanged';
+};
+
+function buildMap(data) {
+  const map = {};
+  if (!data) return map;
+  for (const file of data.files) {
+    const m = {};
+    for (const b of file.benchmarks) {
+      if (!b.error) m[`${b.suite} > ${b.name}`] = normalizeBenchmark(b);
+    }
+    map[file.fileName] = m;
+  }
+  return map;
+}
+
+// Median of each benchmark's own within-run variancePercentage: an honest,
+// measured "noise floor" surfaced so readers can judge whether a sub-noise delta
+// is meaningful. It is the spread the classifier already absorbs as overlap.
+function medianVariance(data) {
+  if (!data) return null;
+  const values = [];
+  for (const file of data.files) {
+    for (const b of file.benchmarks) {
+      if (!b.error && typeof b.variancePercentage === 'number' && b.variancePercentage > 0) {
+        values.push(b.variancePercentage);
+      }
+    }
+  }
+  if (values.length === 0) return null;
+  values.sort((a, b) => a - b);
+  const mid = Math.floor(values.length / 2);
+  return values.length % 2 ? values[mid] : (values[mid - 1] + values[mid]) / 2;
+}
+
+const newStats = () => ({ improved: 0, regressed: 0, unchanged: 0, new: 0, changes: [] });
+
+// options:
+//   interpData, bytecodeData       — required PR-run reports (this run)
+//   interpMain, bytecodeMain        — same-runner `main`-build reports (may be null)
+//   profileDiff                     — optional pre-rendered deterministic profile markdown
+//   runnerLabel                     — e.g. 'ubuntu-latest x64'
+// returns { body } — markdown WITHOUT the HTML comment marker.
+function buildBenchmarkComparison(options) {
+  const interpData = options.interpData;
+  const bytecodeData = options.bytecodeData;
+  const interpMain = options.interpMain || null;
+  const bytecodeMain = options.bytecodeMain || null;
+  const runnerLabel = options.runnerLabel || 'ubuntu-latest x64';
+
+  const hasAnyMain = interpMain || bytecodeMain;
+
+  const interpMainMap = buildMap(interpMain);
+  const bytecodeMainMap = buildMap(bytecodeMain);
+  const bytecodeCurrentMap = buildMap(bytecodeData);
+
+  const totals = { interp: newStats(), bytecode: newStats() };
+  let totalBenchmarks = 0;
+  const fileBlocks = [];
+
+  for (const file of interpData.files) {
+    const fileName = file.fileName;
+    const iBase = interpMainMap[fileName] || {};
+    const bBase = bytecodeMainMap[fileName] || {};
+    const bCurrent = bytecodeCurrentMap[fileName] || {};
+    const fileS = { interp: newStats(), bytecode: newStats() };
+    let fileRows = '';
+    const benches = file.benchmarks.filter((b) => !b.error);
+    totalBenchmarks += benches.length;
+
+    if (hasAnyMain) {
+      fileRows += '| Benchmark | Interpreted (main → PR) | Δ | Bytecode (main → PR) | Δ |\n';
+      fileRows += '|-----------|------------|---|---------|---|\n';
+    } else {
+      fileRows += '| Benchmark | Interpreted | Bytecode |\n';
+      fileRows += '|-----------|------------|----------|\n';
+    }
+
+    for (const bench of benches) {
+      const key = `${bench.suite} > ${bench.name}`;
+      const iCurrent = normalizeBenchmark(bench);
+      const bCurrentEntry = bCurrent[key];
+      const iBaseEntry = iBase[key];
+      const bBaseEntry = bBase[key];
+
+      if (hasAnyMain) {
+        let iDelta;
+        if (iBaseEntry) {
+          const c = calculateChange(iBaseEntry, iCurrent);
+          const classification = classify(iBaseEntry, iCurrent);
+          if (c != null) fileS.interp.changes.push(c);
+          fileS.interp[classification]++;
+          iDelta = fmtDelta(classification, c);
+        } else {
+          iDelta = '🆕';
+          fileS.interp.new++;
+        }
+
+        let bDelta;
+        if (bCurrentEntry && bBaseEntry) {
+          const c = calculateChange(bBaseEntry, bCurrentEntry);
+          const classification = classify(bBaseEntry, bCurrentEntry);
+          if (c != null) fileS.bytecode.changes.push(c);
+          fileS.bytecode[classification]++;
+          bDelta = fmtDelta(classification, c);
+        } else if (bCurrentEntry) {
+          bDelta = '🆕';
+          fileS.bytecode.new++;
+        } else {
+          bDelta = '—';
+        }
+
+        fileRows += `| ${bench.name} | ${fmtCell(iBaseEntry, iCurrent)} | ${iDelta} | ${bCurrentEntry ? fmtCell(bBaseEntry, bCurrentEntry) : '—'} | ${bDelta} |\n`;
+      } else {
+        fileRows += `| ${bench.name} | ${fmtRange(iCurrent)} | ${bCurrentEntry ? fmtRange(bCurrentEntry) : '—'} |\n`;
+      }
+    }
+
+    for (const mode of ['interp', 'bytecode']) {
+      for (const k of ['improved', 'regressed', 'unchanged', 'new']) totals[mode][k] += fileS[mode][k];
+      totals[mode].changes.push(...fileS[mode].changes);
+    }
+
+    const hasSignificant = fileS.interp.improved + fileS.interp.regressed + fileS.bytecode.improved + fileS.bytecode.regressed > 0;
+
+    function modeSummary(label, s) {
+      const p = [];
+      if (s.improved > 0) p.push(`🟢 ${s.improved}`);
+      if (s.regressed > 0) p.push(`🔴 ${s.regressed}`);
+      if (s.new > 0) p.push(`🆕 ${s.new}`);
+      if (s.unchanged > 0) p.push(`${s.unchanged} unch.`);
+      const avg = fmtAvg(s.changes);
+      return `${label}: ${p.length > 0 ? p.join(', ') : `${s.changes.length + s.new}`}${avg ? ` · ${avg}` : ''}`;
+    }
+
+    const fileSummary = hasAnyMain
+      ? modeSummary('Interp', fileS.interp) + ' · ' + modeSummary('Bytecode', fileS.bytecode)
+      : `${benches.length} benchmarks`;
+
+    fileBlocks.push({ fileName, fileSummary, fileRows, hasSignificant });
+  }
+
+  let body = '## Benchmark Results\n\n';
+
+  if (hasAnyMain) {
+    function overallMode(label, s) {
+      const p = [];
+      if (s.improved > 0) p.push(`🟢 ${s.improved} improved`);
+      if (s.regressed > 0) p.push(`🔴 ${s.regressed} regressed`);
+      if (s.new > 0) p.push(`🆕 ${s.new} new`);
+      if (s.unchanged > 0) p.push(`${s.unchanged} unchanged`);
+      const avg = fmtAvg(s.changes);
+      return `**${label}:** ${p.join(' · ')}${avg ? ` · ${avg}` : ''}`;
+    }
+    body += `**${totalBenchmarks}** benchmarks · PR vs same-runner \`main\` build\n\n`;
+    body += overallMode('Interpreted', totals.interp) + '\\\n';
+    body += overallMode('Bytecode', totals.bytecode) + '\n\n';
+
+    const iNoise = medianVariance(interpData);
+    const bNoise = medianVariance(bytecodeData);
+    const noiseParts = [];
+    if (iNoise != null) noiseParts.push(`interpreted ±${iNoise.toFixed(1)}%`);
+    if (bNoise != null) noiseParts.push(`bytecode ±${bNoise.toFixed(1)}%`);
+    if (noiseParts.length > 0) {
+      body += `Typical per-run noise (median variance): ${noiseParts.join(', ')}. Deltas within noise overlap and read as unchanged.\n\n`;
+    }
+  } else {
+    body += `**${totalBenchmarks}** benchmarks (no same-runner \`main\` build to compare)\n\n`;
+  }
+
+  for (const block of fileBlocks) {
+    const shortName = block.fileName.replace(/^benchmarks\//, '');
+    const open = block.hasSignificant ? ' open' : '';
+    body += `<details${open}>\n`;
+    body += `<summary><strong>${shortName}</strong> — ${block.fileSummary}</summary>\n\n`;
+    body += block.fileRows;
+    body += `\n</details>\n\n`;
+  }
+
+  if (options.profileDiff) {
+    body += options.profileDiff + '\n\n';
+  }
+
+  body += hasAnyMain
+    ? `<sub>Measured on ${runnerLabel}. Each PR run also builds the \`main\` base and benchmarks it back-to-back on the same runner after a warm-up discard, so the ranges compare two runs measured under the same conditions; overlapping min/max ranges are treated as unchanged noise. Percentage deltas are secondary context. See docs/adr/0076.</sub>`
+    : `<sub>Measured on ${runnerLabel}. No \`main\` build was available to compare against, so these are raw PR-run numbers. See docs/adr/0076.</sub>`;
+
+  return { body };
+}
+
+module.exports = {
+  buildBenchmarkComparison,
+  classify,
+  calculateChange,
+  normalizeBenchmark,
+  buildMap,
+  medianVariance,
+};
diff --git a/scripts/profile-diff.js b/scripts/profile-diff.js
index 1baa8724d..221f95f7a 100755
--- a/scripts/profile-diff.js
+++ b/scripts/profile-diff.js
@@ -217,7 +217,7 @@ function buildProfileDiffSection(options) {
 
   if (baselineFiles.length === 0) {
     return {
-      markdown: '## Deterministic profile diff\n\nDeterministic profile diff: no cached baseline was restored.',
+      markdown: '## Deterministic profile diff\n\nDeterministic profile diff: no `main`-build profile was available.',
       signalCount: 0,
     };
   }
diff --git a/scripts/test-benchmark-compare.ts b/scripts/test-benchmark-compare.ts
new file mode 100644
index 000000000..3f5d839f9
--- /dev/null
+++ b/scripts/test-benchmark-compare.ts
@@ -0,0 +1,167 @@
+#!/usr/bin/env bun
+/**
+ * test-benchmark-compare.ts
+ *
+ * Unit tests for scripts/benchmark-compare.js — the PR "Benchmark Results"
+ * comment builder. Locks the range-overlap classifier and proves the issue
+ * #815 failure mode: when the PR and `main` ranges overlap (the realistic
+ * same-runner case), a benchmark untouched by the diff reads as `unchanged`,
+ * not improved/regressed — even when its point-estimate delta is positive.
+ *
+ * Run: bun run scripts/test-benchmark-compare.ts
+ */
+
+import { createRequire } from "node:module";
+
+const require = createRequire(import.meta.url);
+const {
+  classify,
+  medianVariance,
+  buildBenchmarkComparison,
+} = require("./benchmark-compare.js");
+
+let passed = 0;
+
+function assert(condition: unknown, message: string): void {
+  if (!condition) throw new Error(`Assertion failed: ${message}`);
+  passed++;
+}
+
+function assertEqual<T>(actual: T, expected: T, message: string): void {
+  if (actual !== expected) {
+    throw new Error(`${message}\n  expected: ${JSON.stringify(expected)}\n  actual:   ${JSON.stringify(actual)}`);
+  }
+  passed++;
+}
+
+// A benchmark entry: opsPerSec point estimate plus the min/max range across rounds.
+const bench = (
+  suite: string,
+  name: string,
+  opsPerSec: number,
+  min: number,
+  max: number,
+  variancePercentage = 0,
+) => ({ suite, name, opsPerSec, minOpsPerSec: min, maxOpsPerSec: max, variancePercentage });
+
+const report = (fileName: string, benchmarks: object[]) => ({
+  files: [{ fileName, benchmarks }],
+  totalBenchmarks: benchmarks.length,
+});
+
+// -- classify(): pure range overlap -----------------------------------------------
+
+console.log("classify() range overlap...");
+{
+  // PR range entirely above main range → improved.
+  assertEqual(
+    classify({ minOpsPerSec: 100, maxOpsPerSec: 110 }, { minOpsPerSec: 120, maxOpsPerSec: 130 }),
+    "improved",
+    "PR min above main max is improved",
+  );
+  // PR range entirely below main range → regressed.
+  assertEqual(
+    classify({ minOpsPerSec: 120, maxOpsPerSec: 130 }, { minOpsPerSec: 100, maxOpsPerSec: 110 }),
+    "regressed",
+    "PR max below main min is regressed",
+  );
+  // Overlapping ranges → unchanged noise.
+  assertEqual(
+    classify({ minOpsPerSec: 100, maxOpsPerSec: 130 }, { minOpsPerSec: 110, maxOpsPerSec: 140 }),
+    "unchanged",
+    "overlapping ranges are unchanged",
+  );
+  // No main entry → new.
+  assertEqual(
+    classify(undefined, { minOpsPerSec: 110, maxOpsPerSec: 140 }),
+    "new",
+    "missing main entry is new",
+  );
+}
+
+// -- Issue #815: same-runner overlap must NOT misclassify an untouched benchmark ---
+
+console.log("issue #815: untouched benchmark reads as unchanged...");
+{
+  // Mirrors the real PR #805 evidence: a regex-only change reported arraybuffer
+  // (which uses no regex) as "improved" purely from cross-runner offset. With a
+  // same-runner `main` build the ranges overlap, so it reads as unchanged even
+  // though the point estimate ticked up +2.5%.
+  const main = { opsPerSec: 120000, minOpsPerSec: 111000, maxOpsPerSec: 127000 };
+  const pr = { opsPerSec: 123000, minOpsPerSec: 110000, maxOpsPerSec: 127052 };
+  assertEqual(classify(main, pr), "unchanged", "same-runner overlap is unchanged, not improved");
+}
+
+// -- medianVariance(): honest noise floor from measured data -----------------------
+
+console.log("medianVariance()...");
+{
+  const data = report("benchmarks/a.js", [
+    bench("a", "x", 100, 90, 110, 2),
+    bench("a", "y", 100, 90, 110, 6),
+    bench("a", "z", 100, 90, 110, 4),
+    bench("a", "err-skipped", 0, 0, 0, 0), // zero variance is ignored
+  ]);
+  assertEqual(medianVariance(data), 4, "median of {2,4,6} is 4");
+  assertEqual(medianVariance(null), null, "null data yields null noise");
+}
+
+// -- buildBenchmarkComparison(): with a same-runner main build ---------------------
+
+console.log("buildBenchmarkComparison() with main build...");
+{
+  const interpData = report("benchmarks/arraybuffer.js", [
+    bench("arraybuffer", "create", 123000, 110000, 127052, 5),
+    bench("arraybuffer", "brand new", 5000, 4900, 5100, 3),
+  ]);
+  const interpMain = report("benchmarks/arraybuffer.js", [
+    bench("arraybuffer", "create", 120000, 111000, 127000, 5),
+    // "brand new" intentionally absent from main → should render as 🆕.
+  ]);
+  const bytecodeData = report("benchmarks/arraybuffer.js", [
+    bench("arraybuffer", "create", 162000, 140000, 190000, 8),
+  ]);
+  const bytecodeMain = report("benchmarks/arraybuffer.js", [
+    bench("arraybuffer", "create", 160000, 152000, 164000, 8),
+  ]);
+
+  const { body } = buildBenchmarkComparison({
+    interpData,
+    interpMain,
+    bytecodeData,
+    bytecodeMain,
+    runnerLabel: "ubuntu-latest x64",
+  });
+
+  assert(body.includes("PR vs same-runner `main` build"), "header names the same-runner main build");
+  assert(body.includes("main → PR"), "table columns show main → PR direction");
+  assert(body.includes("Typical per-run noise"), "noise floor line is present");
+  assert(body.includes("docs/adr/0076"), "footnote references the ADR");
+  assert(body.includes("🆕"), "benchmark missing from main renders as new");
+  // The untouched 'create' benchmark overlaps → must read as unchanged overlap,
+  // never a green improvement.
+  assert(body.includes("~ overlap"), "overlapping benchmark renders as unchanged overlap");
+  assert(!body.includes("🟢 1 improved"), "no spurious improvement from same-runner overlap");
+}
+
+// -- buildBenchmarkComparison(): no main build to compare --------------------------
+
+console.log("buildBenchmarkComparison() without main build...");
+{
+  const interpData = report("benchmarks/arraybuffer.js", [bench("arraybuffer", "create", 123000, 110000, 127052)]);
+  const bytecodeData = report("benchmarks/arraybuffer.js", [bench("arraybuffer", "create", 162000, 140000, 190000)]);
+
+  const { body } = buildBenchmarkComparison({
+    interpData,
+    interpMain: null,
+    bytecodeData,
+    bytecodeMain: null,
+  });
+
+  assert(body.includes("no same-runner `main` build to compare"), "states that no main build was available");
+  assert(!body.includes("Δ"), "no delta column without a main build");
+  // The footnote must not claim a comparison that did not happen (issue #815 review).
+  assert(!body.includes("back-to-back"), "no-main footnote does not claim a back-to-back comparison");
+}
+
+console.log(`\nbenchmark-compare: ${passed} assertions passed.`);

From f6647abcad1594004710bb9831557ca949b3b1db Mon Sep 17 00:00:00 2001
From: Johannes Stein <johannesstein@freeze-dev.com>
Date: Sat, 27 Jun 2026 09:46:43 +0100
Subject: [PATCH 2/4] docs(adr): link ADR 0076 to its pull request

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/adr/0076-same-runner-benchmark-comparison.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/adr/0076-same-runner-benchmark-comparison.md b/docs/adr/0076-same-runner-benchmark-comparison.md
index c02f695bf..292c5792d 100644
--- a/docs/adr/0076-same-runner-benchmark-comparison.md
+++ b/docs/adr/0076-same-runner-benchmark-comparison.md
@@ -3,6 +3,7 @@
 **Date:** 2026-06-27
 **Area:** `ci` / `tooling`
 **Issue:** [#815](https://github.com/frostney/GocciaScript/issues/815)
+**Pull Request:** [#886](https://github.com/frostney/GocciaScript/pull/886)
 
 The PR "Benchmark Results" comment previously classified benchmarks as 🟢 improved / 🔴 regressed by comparing the PR's freshly-measured run against a **cached baseline measured in a different job, on a different GitHub-hosted runner, at a different time** (whatever `main` last pushed). The baseline was written by `ci.yml`'s `benchmark` job via `actions/cache/save` and restored in `pr.yml`'s `benchmark-comment` job by `restore-keys` prefix. Cross-runner/cross-time variance is systematically larger than each run's own min/max spread, so the range-overlap classifier (improved iff PR min > baseline max) flipped large numbers of benchmarks to improved/regressed even when the diff could not possibly affect them — e.g. PR [#805](https://github.com/frostney/GocciaScript/pull/805) (regex-decode only) reported `arraybuffer.js` and `arrays.js`, which use no regex, as broadly improved. The verdicts were confident but not actionable, and a real regression was indistinguishable from runner noise.
 

From 4275c38ad5d57172b0eb2d7eccb161d62d448790 Mon Sep 17 00:00:00 2001
From: Johannes Stein <johannesstein@freeze-dev.com>
Date: Sat, 27 Jun 2026 13:26:23 +0100
Subject: [PATCH 3/4] ci(benchmark): keep benchmark comment when build-main
 fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark job hard-depended on build-main (needs: [build, build-main]),
so a build-main failure (base predating the benchmarkrunner target, or a
transient FPC install hiccup) skipped benchmark and, transitively,
benchmark-comment — the PR got no benchmark feedback at all, even though
build-pr succeeded. That made the downstream graceful-degradation path
(optional main downloads + null-main handling in buildBenchmarkComparison)
unreachable.

Keep the build-main ordering dependency but run benchmark whenever `build`
itself succeeds (if: !cancelled() && needs.build.result == 'success'), and
make the main-build inputs optional: continue-on-error on the build-main
download, a tolerant chmod, and if-no-files-found: ignore on the main result
and profile uploads. When build-main fails the comparison now posts PR-only
numbers instead of going silent.

Addresses CodeRabbit review feedback on #886.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/pr.yml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 74e3f79f7..2c4fe5a86 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -193,7 +193,13 @@ jobs:
           fi
 
   benchmark:
+    # `needs` keeps build-main ordered before this job, but its failure must not
+    # skip the benchmark/benchmark-comment chain: when build-main fails (e.g. the
+    # base predates the benchmarkrunner target, or a transient FPC hiccup) the
+    # downstream comparison already degrades to PR-only numbers via the optional
+    # main downloads + null-main handling. Run whenever `build` itself succeeded.
     needs: [build, build-main]
+    if: ${{ !cancelled() && needs.build.result == 'success' }}
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
@@ -216,6 +222,7 @@ jobs:
 
       - name: Download main benchmark runner
         uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7
+        continue-on-error: true
         with:
           name: build-main
           path: build-main/
@@ -223,7 +230,8 @@ jobs:
       - name: Make binaries executable
         run: |
           chmod +x build/*
-          chmod +x build-main/*
+          # build-main is optional: a failed build-main job leaves this empty.
+          chmod +x build-main/* 2>/dev/null || true
 
       # Same-runner A/B (issue #815): the PR and `main` builds are measured on
       # this one runner so the delta reflects the diff, not cross-runner
@@ -315,6 +323,8 @@ jobs:
         with:
           name: benchmark-${{ matrix.mode }}-main
           retention-days: 1
+          # Absent when build-main failed; the comparison degrades to PR-only.
+          if-no-files-found: ignore
           path: benchmark-${{ matrix.mode }}-main.json
 
       - name: Upload deterministic profiles
@@ -331,6 +341,8 @@ jobs:
         with:
           name: profile-main
           retention-days: 1
+          # Absent when build-main failed; the profile diff degrades gracefully.
+          if-no-files-found: ignore
           path: profile-main/
 
   cli:

From e30b90b390b41e9e480bd316fe8f7a41fe385a9e Mon Sep 17 00:00:00 2001
From: Johannes Stein <johannesstein@freeze-dev.com>
Date: Sat, 27 Jun 2026 15:12:52 +0100
Subject: [PATCH 4/4] ci(benchmark): bound benchmark runs and lighten the
 warm-up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark (interpreted) leg timed out after 51 minutes on #886: the PR
measured pass hung while the main pass (same workloads, same runner) had
just completed in ~3 min. Locally the full interpreted suite runs in ~2 min,
so this is a non-deterministic per-benchmark calibration runaway on a loaded
GitHub runner, not a benchmark bug.

The trigger was self-inflicted: the two-binary warm-up ran with the default
200 ms calibration (no override), making it ~a full extra pass per binary —
roughly 2.5x the interpreted work and heavy runner load right before the
final measured pass, which is when calibration is most likely to misfire.
There was also no timeout, so a runaway burned 51 min and red-marked the PR.

- Replace the two-binary warm-up with a single cheap one (PR binary,
  ROUNDS=1 + CALIBRATION_MS=10): still ramps the CPU and warms the shared
  benchmark-file cache, but light and low-load. The runner's per-benchmark
  warm-up (5 iters) covers binary-specific warmth, so one binary suffices.
- Wrap each pass in `timeout -k 30 480` (8 min cap; normal ~3 min). A killed
  main pass writes no JSON and degrades to PR-only; a killed PR pass fails
  fast instead of spinning.
- Add `timeout-minutes: 30` to the benchmark job as a hard backstop.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/pr.yml | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 2c4fe5a86..9072f3509 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -201,6 +201,11 @@ jobs:
     needs: [build, build-main]
     if: ${{ !cancelled() && needs.build.result == 'success' }}
     runs-on: ubuntu-latest
+    # Backstop: a normal run is ~7-9 min. Per-benchmark calibration can
+    # occasionally run away on a loaded GitHub runner (issue #815 follow-up); the
+    # per-step `timeout`s below cap each pass, and this caps the whole leg so a
+    # runaway fails in minutes instead of burning the default 360-minute budget.
+    timeout-minutes: 30
     strategy:
       fail-fast: false
       matrix:
@@ -234,17 +239,23 @@ jobs:
           chmod +x build-main/* 2>/dev/null || true
 
       # Same-runner A/B (issue #815): the PR and `main` builds are measured on
-      # this one runner so the delta reflects the diff, not cross-runner
-      # variance. Both benchmark the PR's `benchmarks/` workloads. The warm-up
-      # runs are discarded on purpose — they shed runner cold-start (page
-      # faults, sub-turbo clocks) for BOTH binaries so neither the `main` run
-      # (measured first) nor the PR run is penalised by being measured cold.
-      - name: Warm up benchmark runners
+      # this one runner so the delta reflects the diff, not cross-runner variance.
+      # Both benchmark the PR's `benchmarks/` workloads.
+      #
+      # One cheap, discarded warm-up pass ramps the runner CPU off its idle clock
+      # and warms the OS file cache for the benchmark sources (shared by both
+      # binaries) before measuring. It deliberately uses a tiny calibration
+      # (10 ms, 1 round) so it stays light: an earlier full two-binary warm-up
+      # roughly doubled the interpreted leg and loaded the runner right before
+      # measuring, destabilising per-benchmark calibration enough that a measured
+      # pass could run away for tens of minutes (issue #815 follow-up). The
+      # runner's built-in per-benchmark warm-up (5 iterations) covers
+      # binary-specific warmth, so warming one binary here is sufficient.
+      - name: Warm up benchmark runner
         env:
           GOCCIA_BENCH_ROUNDS: 1
-        run: |
-          ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true
-          ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true
+          GOCCIA_BENCH_CALIBRATION_MS: 10
+        run: timeout 120 ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress > /dev/null 2>&1 || true
 
       # `main` is built from the PR's base commit, so it cannot run benchmarks the
       # PR newly adds or changes to use new language features. The report is still
@@ -252,18 +263,23 @@ jobs:
       # being blocked: a benchmark that errors on `main` simply has no baseline
       # entry and renders as 🆕 new. The PR's own run below is intentionally NOT
       # tolerant — a broken PR benchmark should fail the leg.
+      # `timeout` (GNU coreutils, present on ubuntu runners) caps a single pass:
+      # a normal interpreted/bytecode pass is ~3 min, so 8 min is generous, and a
+      # runaway calibration loop is killed (SIGKILL 30 s after SIGTERM) instead of
+      # spinning for the whole job. A killed `main` pass writes no JSON and the
+      # comparison degrades to PR-only via if-no-files-found + null-main handling.
       - name: Run benchmarks (main base)
         continue-on-error: true
         env:
           GOCCIA_BENCH_CALIBRATION_MS: 100
           GOCCIA_BENCH_ROUNDS: 7
-        run: ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}-main.json
+        run: timeout -k 30 480 ./build-main/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}-main.json
 
       - name: Run benchmarks
         env:
           GOCCIA_BENCH_CALIBRATION_MS: 100
           GOCCIA_BENCH_ROUNDS: 7
-        run: ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json
+        run: timeout -k 30 480 ./build/GocciaBenchmarkRunner benchmarks ${{ matrix.mode == 'bytecode' && '--mode=bytecode' || '' }} --no-progress --format=json --output=benchmark-${{ matrix.mode }}.json
 
       - name: Capture deterministic profiles (main base)
         if: matrix.mode == 'bytecode'