From 3b28d03a444644b66d8b16b4c42ffec8700be4c0 Mon Sep 17 00:00:00 2001
From: Partouf <partouf@gmail.com>
Date: Sat, 27 Jun 2026 15:53:15 +0200
Subject: [PATCH 1/5] Add objdump parser benchmark with aggregated JSON output

Add a Catch2 micro-benchmark for the ObjDumpParser pipeline (fromStream +
outputJson) over the pre-captured objdump resource files, plus tooling to
run it repeatedly and aggregate run-to-run statistics.

- src/test/objdump_benchmark.cpp: BENCHMARK over four inputs of increasing
  size, tagged [!benchmark] so it stays out of the default test run.
- src/test/CMakeLists.txt: compile the benchmark and enable Catch2's
  CATCH_CONFIG_ENABLE_BENCHMARKING.
- scripts/run-benchmarks.sh: run N times, emit one Catch2 XML per run.
- scripts/aggregate_bench.py: combine the XML runs into a summary JSON
  (mean of per-run means, run-to-run stddev, CV, min/max, per-run means).
- .github/workflows/benchmark.yml: manual workflow that builds Release,
  runs the benchmarks, and uploads the summary + raw XML as an artifact.
- .gitignore: ignore build-release/ and bench-results/.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml |  55 +++++++++++++++++
 .gitignore                      |   2 +
 scripts/aggregate_bench.py      | 102 ++++++++++++++++++++++++++++++++
 scripts/run-benchmarks.sh       |  52 ++++++++++++++++
 src/test/CMakeLists.txt         |   4 +-
 src/test/objdump_benchmark.cpp  |  71 ++++++++++++++++++++++
 6 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100755 scripts/aggregate_bench.py
 create mode 100755 scripts/run-benchmarks.sh
 create mode 100644 src/test/objdump_benchmark.cpp

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..26a07e9
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,55 @@
+name: Benchmark
+
+# Manual trigger: run this to produce a benchmark summary you can download as an
+# artifact (and promote to a baseline for future comparisons).
+on:
+  workflow_dispatch:
+    inputs:
+      runs:
+        description: "Number of benchmark process runs to average"
+        required: false
+        default: "5"
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Conan
+        id: conan
+        uses: turtlebrowser/get-conan@main
+        with:
+          version: 2.8.1
+
+      - name: Configure conan
+        run: |
+          conan profile detect
+          sed -i 's/compiler\.cppstd=.*/compiler.cppstd=20/g' ~/.conan2/profiles/default
+          conan install --build=fmt/11.0.0 .
+
+      - name: Build (Release)
+        run: |
+          CURDIR=$PWD
+          mkdir -p build-release
+          cd build-release
+          cmake "-DCMAKE_PROJECT_TOP_LEVEL_INCLUDES=$CURDIR/conan_provider.cmake" -DCMAKE_C_COMPILER=/usr/bin/gcc-10 -DCMAKE_CXX_COMPILER=/usr/bin/g++-10 -DCMAKE_BUILD_TYPE=Release ..
+          cmake --build .
+
+      - name: Run benchmarks
+        env:
+          BENCH_RUNS: ${{ github.event.inputs.runs }}
+          BENCH_LABEL: ${{ runner.os }}/${{ runner.arch }}
+        run: |
+          ./scripts/run-benchmarks.sh
+
+      - name: Show summary
+        run: cat bench-results/bench-summary.json
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bench-summary
+          path: |
+            bench-results/bench-summary.json
+            bench-results/run-*.xml
diff --git a/.gitignore b/.gitignore
index 6c2375d..ea0deb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,8 @@
 *.app
 
 build/
+build-release/
+bench-results/
 CMakeUserPresets.json
 
 # CLion stuff
diff --git a/scripts/aggregate_bench.py b/scripts/aggregate_bench.py
new file mode 100755
index 0000000..815d5fd
--- /dev/null
+++ b/scripts/aggregate_bench.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Aggregate one or more Catch2 (v2) XML benchmark runs into a summary JSON.
+
+Each input XML is one process run of the benchmark (which itself takes many
+samples). This script combines the per-run means for each benchmark into
+run-to-run statistics suitable for storing as a CI artifact / baseline.
+
+Usage:
+    aggregate_bench.py run-*.xml -o bench-summary.json
+    aggregate_bench.py run-1.xml run-2.xml --commit "$(git rev-parse HEAD)"
+
+All timing values in the Catch2 XML are nanoseconds; the summary keeps raw
+nanoseconds and adds millisecond convenience values.
+"""
+import argparse
+import datetime
+import json
+import statistics
+import sys
+import xml.etree.ElementTree as ET
+
+
+def parse_run(path):
+    """Return {benchmark_name: {mean_ns, stddev_ns, samples}} for one XML run."""
+    root = ET.parse(path).getroot()
+    out = {}
+    for bench in root.iter("BenchmarkResults"):
+        mean = bench.find("mean")
+        stddev = bench.find("standardDeviation")
+        out[bench.attrib["name"]] = {
+            "mean_ns": float(mean.attrib["value"]) if mean is not None else None,
+            "stddev_ns": float(stddev.attrib["value"]) if stddev is not None else None,
+            "samples": int(bench.attrib["samples"]),
+        }
+    return out
+
+
+def aggregate(paths):
+    runs = [parse_run(p) for p in paths]
+    # Preserve first-seen order of benchmark names.
+    names = []
+    for run in runs:
+        for name in run:
+            if name not in names:
+                names.append(name)
+
+    benchmarks = []
+    for name in names:
+        means = [run[name]["mean_ns"] for run in runs if name in run]
+        within = [run[name]["stddev_ns"] for run in runs if name in run]
+        samples = next(run[name]["samples"] for run in runs if name in run)
+
+        mean_ns = statistics.fmean(means)
+        across_ns = statistics.stdev(means) if len(means) > 1 else 0.0
+        benchmarks.append({
+            "name": name,
+            "runs": len(means),
+            "samples_per_run": samples,
+            "mean_ns": mean_ns,
+            "mean_ms": mean_ns / 1e6,
+            "stddev_across_runs_ns": across_ns,
+            "stddev_across_runs_ms": across_ns / 1e6,
+            "cv_percent": (across_ns / mean_ns * 100.0) if mean_ns else 0.0,
+            "min_ms": min(means) / 1e6,
+            "max_ms": max(means) / 1e6,
+            "mean_within_run_stddev_ms": (statistics.fmean(within) / 1e6) if within else None,
+            "per_run_mean_ms": [m / 1e6 for m in means],
+        })
+    return benchmarks
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("xml", nargs="+", help="Catch2 XML run file(s)")
+    ap.add_argument("-o", "--output", help="JSON output file (default: stdout)")
+    ap.add_argument("--commit", default="", help="git commit SHA to record")
+    ap.add_argument("--ref", default="", help="git ref/branch to record")
+    ap.add_argument("--label", default="", help="free-form label (e.g. runner/build)")
+    args = ap.parse_args()
+
+    summary = {
+        "schema": "asm-parser-bench/1",
+        "generated_utc": datetime.datetime.now(datetime.timezone.utc).isoformat(),
+        "commit": args.commit,
+        "ref": args.ref,
+        "label": args.label,
+        "run_count": len(args.xml),
+        "benchmarks": aggregate(args.xml),
+    }
+    text = json.dumps(summary, indent=2)
+
+    if args.output:
+        with open(args.output, "w") as f:
+            f.write(text + "\n")
+        print(f"Wrote {args.output}", file=sys.stderr)
+    else:
+        print(text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run-benchmarks.sh b/scripts/run-benchmarks.sh
new file mode 100755
index 0000000..6b92dbf
--- /dev/null
+++ b/scripts/run-benchmarks.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+#
+# Run the objdump benchmark N times and aggregate the per-run Catch2 XML into a
+# single summary JSON (bench-results/bench-summary.json by default).
+#
+# Env overrides:
+#   BENCH_RUNS      number of process runs                 (default: 5)
+#   BENCH_FILTER    Catch2 test filter                     (default: [objdump])
+#   BENCH_TEST_BIN  path to asm-parser-test                (default: build-release/src/test/asm-parser-test)
+#   BENCH_OUTDIR    where to write XML + summary           (default: bench-results)
+#   BENCH_LABEL     free-form label stored in the summary  (default: empty)
+set -euo pipefail
+# Work from the repository root so the relative default paths below resolve.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+# ":-" also substitutes the default when a variable is set but empty.
+RUNS="${BENCH_RUNS:-5}"
+FILTER="${BENCH_FILTER:-[objdump]}"
+TEST_BIN="${BENCH_TEST_BIN:-build-release/src/test/asm-parser-test}"
+OUTDIR="${BENCH_OUTDIR:-bench-results}"
+# Fail early, with a build hint, when the test binary is missing or not executable.
+if [ ! -x "$TEST_BIN" ]; then
+  echo "error: test binary not found/executable at '$TEST_BIN'" >&2
+  echo "build a Release first, e.g.:" >&2
+  echo "  cmake -B build-release -G Ninja -S . -DCMAKE_BUILD_TYPE=Release && cmake --build build-release" >&2
+  exit 1
+fi
+# Absolute paths, because each run below executes from inside the test directory.
+TEST_DIR="$(cd "$(dirname "$TEST_BIN")" && pwd)"
+TEST_EXE="$TEST_DIR/$(basename "$TEST_BIN")"
+
+mkdir -p "$OUTDIR"
+ABS_OUTDIR="$(cd "$OUTDIR" && pwd)"
+# One benchmark process per iteration; each writes its own Catch2 XML report.
+xmls=()
+for i in $(seq 1 "$RUNS"); do
+  xml="$ABS_OUTDIR/run-$i.xml"
+  echo "Benchmark run $i/$RUNS -> $OUTDIR/run-$i.xml"
+  # The benchmark locates its resources relative to the cwd, so run it from its own directory.
+  ( cd "$TEST_DIR" && "$TEST_EXE" "$FILTER" -r xml -o "$xml" )
+  xmls+=("$xml")
+done
+# Git metadata for the summary; each falls back to an empty string if git fails.
+COMMIT="$(git -C "$REPO_ROOT" rev-parse HEAD 2>/dev/null || echo "")"
+REF="$(git -C "$REPO_ROOT" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "")"
+
+python3 "$SCRIPT_DIR/aggregate_bench.py" "${xmls[@]}" \
+  --commit "$COMMIT" --ref "$REF" --label "${BENCH_LABEL:-}" \
+  -o "$ABS_OUTDIR/bench-summary.json"
+
+echo "Summary written to $OUTDIR/bench-summary.json"
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 897a3b2..6fa683f 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -1,3 +1,5 @@
-add_executable(asm-parser-test test_test.cpp asmtext_filter_tests.cpp)
+add_executable(asm-parser-test test_test.cpp asmtext_filter_tests.cpp objdump_benchmark.cpp)
 target_link_libraries(asm-parser-test asm-parser-lib)
+# Enable Catch2's built-in micro-benchmarking support (BENCHMARK macro).
+target_compile_definitions(asm-parser-test PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
 add_test(NAME asm-parser-test COMMAND $<TARGET_FILE:asm-parser-test>)
diff --git a/src/test/objdump_benchmark.cpp b/src/test/objdump_benchmark.cpp
new file mode 100644
index 0000000..9c4da41
--- /dev/null
+++ b/src/test/objdump_benchmark.cpp
@@ -0,0 +1,71 @@
+#include <catch2/catch.hpp>
+
+#include "../objdump/parser.hpp"
+#include "../types/filter.hpp"
+
+#include <filesystem>
+#include <fstream>
+#include <sstream>
+#include <string>
+
+namespace
+{
+
+// Path to a resource file relative to the cwd: one level deeper when the cwd ends in "test".
+std::string resourcePath(const std::string &name)
+{
+    const bool inTestDir = std::filesystem::current_path().string().ends_with("test");
+    return std::string(inTestDir ? "../../../resources/" : "../../resources/") + name;
+}
+
+std::string readResource(const std::string &name)
+{
+    std::ifstream fs(resourcePath(name), std::ios::binary);
+    REQUIRE(fs.is_open() == true); // stop the test case when the resource cannot be opened
+    std::ostringstream contents;
+    contents << fs.rdbuf(); // copy the whole file into memory in one go
+    return contents.str();
+}
+
+AsmParser::Filter defaultBinaryFilter() // filter shared by every benchmark in this file
+{
+    AsmParser::Filter filter; // the four flags below are the only fields set here
+    filter.binary = true;
+    filter.plt = true;
+    filter.library_functions = true;
+    filter.unused_labels = true;
+    return filter;
+}
+
+// Parse a buffer with a fresh parser and produce the JSON output, exercising the
+// full objdump pipeline the way main.cpp does for a -binary run. Returns the
+// output size so Catch keeps the result and the work isn't optimised away.
+size_t parseObjDump(const std::string &input, const AsmParser::Filter &filter)
+{
+    AsmParser::ObjDumpParser objdump(filter);
+    std::istringstream source(input);
+    objdump.fromStream(source);
+    // Serialise into an in-memory sink; only the byte count is handed back.
+    std::ostringstream sink;
+    objdump.outputJson(sink);
+    return sink.str().size();
+}
+
+} // namespace
+
+TEST_CASE("objdump parser benchmark", "[!benchmark][objdump]")
+{
+    const auto filter = defaultBinaryFilter();
+
+    // Pre-captured objdump output of various sizes; one BENCHMARK is registered per file.
+    for (const auto &name : {"example.asm", "example_intel.asm", "gcc12_sort_object_reloc.asm",
+                             "gcc12_bin_fmt_O2_flto.asm"})
+    {
+        const std::string input = readResource(name);
+        // The file is read before BENCHMARK so disk I/O is not part of the measured body.
+        BENCHMARK(std::string("parse ") + name)
+        {
+            return parseObjDump(input, filter);
+        };
+    }
+}

From 7c6b98147baf84e0e3e8ab0f3938422553182925 Mon Sep 17 00:00:00 2001
From: Partouf <partouf@gmail.com>
Date: Sat, 27 Jun 2026 15:57:41 +0200
Subject: [PATCH 2/5] Run benchmark workflow automatically on push and PR

Trigger the benchmark workflow on push to main and on pull requests
(matching build.yml), instead of only on manual dispatch. Keep
workflow_dispatch for ad-hoc reruns.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 26a07e9..544b088 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -1,8 +1,13 @@
 name: Benchmark
 
-# Manual trigger: run this to produce a benchmark summary you can download as an
-# artifact (and promote to a baseline for future comparisons).
+# Runs automatically on every push to main and on pull requests, producing a
+# benchmark summary artifact (the main-branch runs can be promoted to a baseline
+# for future comparisons). Also exposes a manual trigger for ad-hoc reruns.
 on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
   workflow_dispatch:
     inputs:
       runs:

From 5f5c2c28a30ef58e02d71509cb2e24ed33a4cdf5 Mon Sep 17 00:00:00 2001
From: Partouf <partouf@gmail.com>
Date: Sat, 27 Jun 2026 15:58:02 +0200
Subject: [PATCH 3/5] Run benchmark workflow on all branches

Drop the branch filters so the benchmark runs on pushes to any branch and on
pull requests targeting any branch.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 544b088..3c8b505 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -1,13 +1,11 @@
 name: Benchmark
 
-# Runs automatically on every push to main and on pull requests, producing a
-# benchmark summary artifact (the main-branch runs can be promoted to a baseline
-# for future comparisons). Also exposes a manual trigger for ad-hoc reruns.
+# Runs automatically on every push to any branch and on pull requests, producing
+# a benchmark summary artifact (the main-branch runs can be promoted to a
+# baseline for future comparisons). Also exposes a manual trigger for ad-hoc reruns.
 on:
   push:
-    branches: [ main ]
   pull_request:
-    branches: [ main ]
   workflow_dispatch:
     inputs:
       runs:

From f8c43454198ce729bdf152b26f15d023e9cc434d Mon Sep 17 00:00:00 2001
From: Partouf <partouf@gmail.com>
Date: Sat, 27 Jun 2026 16:05:38 +0200
Subject: [PATCH 4/5] Update GitHub Actions to latest versions

Bump all actions to their latest releases to clear the Node.js 20
deprecation warnings (runners now default to Node 24):

- actions/checkout@v4        -> @v7  (Node 24)
- actions/upload-artifact@v4 -> @v7  (Node 24)
- turtlebrowser/get-conan@main -> @v1.2  (pin to release)
- fnkr/github-action-ghr@v1   -> @v1.3

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 6 +++---
 .github/workflows/build.yml     | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 3c8b505..80da0b2 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -18,11 +18,11 @@ jobs:
     runs-on: ubuntu-22.04
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v7
 
       - name: Install Conan
         id: conan
-        uses: turtlebrowser/get-conan@main
+        uses: turtlebrowser/get-conan@v1.2
         with:
           version: 2.8.1
 
@@ -50,7 +50,7 @@ jobs:
       - name: Show summary
         run: cat bench-results/bench-summary.json
 
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v7
         with:
           name: bench-summary
           path: |
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6f85850..f9bb428 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,11 +12,11 @@ jobs:
     runs-on: ubuntu-22.04
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v7
 
       - name: Install Conan
         id: conan
-        uses: turtlebrowser/get-conan@main
+        uses: turtlebrowser/get-conan@v1.2
         with:
           version: 2.8.1
 
@@ -42,12 +42,12 @@ jobs:
         run: |
           $GITHUB_WORKSPACE/build/src/asm-parser $GITHUB_WORKSPACE/resources/example_intel.asm > ./example_intel.json
 
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v7
         with:
           path: build/src/asm-parser
 
       - name: Release
-        uses: fnkr/github-action-ghr@v1
+        uses: fnkr/github-action-ghr@v1.3
         if: startsWith(github.ref, 'refs/tags/')
         env:
           GHR_COMPRESS: xz

From a912400b6e88af7cb5f0f4aaf4840b3bc2d6e285 Mon Sep 17 00:00:00 2001
From: Partouf <partouf@gmail.com>
Date: Sat, 27 Jun 2026 16:13:38 +0200
Subject: [PATCH 5/5] Add benchmark baseline and regression comparison in CI

Capture a baseline benchmark summary (5 runs, GitHub runner) and compare
each new run against it in the workflow.

- bench-results/baseline.json: committed baseline; .gitignore keeps it
  tracked while ignoring other generated benchmark output.
- scripts/compare_bench.py: compares a summary against the baseline,
  prints a per-benchmark delta table, writes a Markdown table to the
  GitHub step summary, and exits non-zero on a regression beyond the
  threshold (default 10%).
- benchmark.yml: run the comparison after generating the summary; upload
  the artifact even on failure.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml |  10 ++++
 .gitignore                      |   4 +-
 bench-results/baseline.json     |  90 ++++++++++++++++++++++++++++
 scripts/compare_bench.py        | 101 ++++++++++++++++++++++++++++++++
 4 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 bench-results/baseline.json
 create mode 100755 scripts/compare_bench.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 80da0b2..6a1239c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -50,7 +50,17 @@ jobs:
       - name: Show summary
         run: cat bench-results/bench-summary.json
 
+      - name: Compare against baseline
+        run: |
+          python3 scripts/compare_bench.py \
+            --baseline bench-results/baseline.json \
+            --current bench-results/bench-summary.json \
+            --threshold "${BENCH_THRESHOLD:-10}"
+        env:
+          BENCH_THRESHOLD: "10"
+
       - uses: actions/upload-artifact@v7
+        if: always()
         with:
           name: bench-summary
           path: |
diff --git a/.gitignore b/.gitignore
index ea0deb9..8b314ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,7 +33,9 @@
 
 build/
 build-release/
-bench-results/
+# ignore generated benchmark output, but keep the committed baseline
+/bench-results/*
+!/bench-results/baseline.json
 CMakeUserPresets.json
 
 # CLion stuff
diff --git a/bench-results/baseline.json b/bench-results/baseline.json
new file mode 100644
index 0000000..5e49968
--- /dev/null
+++ b/bench-results/baseline.json
@@ -0,0 +1,90 @@
+{
+  "schema": "asm-parser-bench/1",
+  "generated_utc": "2026-06-27T14:08:48.248475+00:00",
+  "commit": "c3440213b27ecae1583337c4b446991da21341a0",
+  "ref": "HEAD",
+  "label": "Linux/X64",
+  "run_count": 5,
+  "benchmarks": [
+    {
+      "name": "parse example.asm",
+      "runs": 5,
+      "samples_per_run": 100,
+      "mean_ns": 155782.8,
+      "mean_ms": 0.1557828,
+      "stddev_across_runs_ns": 1852.4474891343075,
+      "stddev_across_runs_ms": 0.0018524474891343074,
+      "cv_percent": 1.1891219628446192,
+      "min_ms": 0.154184,
+      "max_ms": 0.158972,
+      "mean_within_run_stddev_ms": 0.016918420000000003,
+      "per_run_mean_ms": [
+        0.155571,
+        0.155049,
+        0.154184,
+        0.155138,
+        0.158972
+      ]
+    },
+    {
+      "name": "parse example_intel.asm",
+      "runs": 5,
+      "samples_per_run": 100,
+      "mean_ns": 7681760.0,
+      "mean_ms": 7.68176,
+      "stddev_across_runs_ns": 8631.949374272303,
+      "stddev_across_runs_ms": 0.008631949374272302,
+      "cv_percent": 0.11236942281810812,
+      "min_ms": 7.67117,
+      "max_ms": 7.69446,
+      "mean_within_run_stddev_ms": 0.08032122,
+      "per_run_mean_ms": [
+        7.67117,
+        7.69446,
+        7.67717,
+        7.68252,
+        7.68348
+      ]
+    },
+    {
+      "name": "parse gcc12_sort_object_reloc.asm",
+      "runs": 5,
+      "samples_per_run": 100,
+      "mean_ns": 6399328.0,
+      "mean_ms": 6.399328,
+      "stddev_across_runs_ns": 95071.93550149276,
+      "stddev_across_runs_ms": 0.09507193550149276,
+      "cv_percent": 1.4856549859843526,
+      "min_ms": 6.33027,
+      "max_ms": 6.56389,
+      "mean_within_run_stddev_ms": 0.33537957999999995,
+      "per_run_mean_ms": [
+        6.56389,
+        6.37586,
+        6.38739,
+        6.33027,
+        6.33923
+      ]
+    },
+    {
+      "name": "parse gcc12_bin_fmt_O2_flto.asm",
+      "runs": 5,
+      "samples_per_run": 100,
+      "mean_ns": 117524800.0,
+      "mean_ms": 117.5248,
+      "stddev_across_runs_ns": 272434.76283323317,
+      "stddev_across_runs_ms": 0.27243476283323315,
+      "cv_percent": 0.231810445823548,
+      "min_ms": 117.144,
+      "max_ms": 117.791,
+      "mean_within_run_stddev_ms": 0.9706758000000001,
+      "per_run_mean_ms": [
+        117.604,
+        117.791,
+        117.735,
+        117.35,
+        117.144
+      ]
+    }
+  ]
+}
diff --git a/scripts/compare_bench.py b/scripts/compare_bench.py
new file mode 100755
index 0000000..9912014
--- /dev/null
+++ b/scripts/compare_bench.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+"""Compare a benchmark summary against a committed baseline.
+
+Reads two summary JSON files produced by aggregate_bench.py and reports the
+per-benchmark delta. Exits non-zero if any benchmark regressed (got slower) by
+more than the threshold, so it can gate a CI job.
+
+Usage:
+    compare_bench.py --baseline bench-results/baseline.json \
+                     --current  bench-results/bench-summary.json \
+                     --threshold 10
+
+A Markdown table is written to $GITHUB_STEP_SUMMARY when that env var is set.
+"""
+import argparse
+import json
+import os
+import sys
+
+
+def load(path):
+    with open(path) as f:
+        data = json.load(f)
+    return {b["name"]: b for b in data.get("benchmarks", [])}, data
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("--baseline", required=True)
+    ap.add_argument("--current", required=True)
+    ap.add_argument("--threshold", type=float, default=10.0,
+                    help="max allowed regression in percent (default: 10)")
+    args = ap.parse_args()
+
+    base, base_meta = load(args.baseline)
+    cur, cur_meta = load(args.current)
+
+    rows = []
+    regressions = []
+    for name, c in cur.items():
+        b = base.get(name)
+        cur_ms = c["mean_ms"]
+        if b is None:
+            rows.append((name, None, cur_ms, None, "NEW"))
+            continue
+        base_ms = b["mean_ms"]
+        delta = (cur_ms - base_ms) / base_ms * 100.0 if base_ms else 0.0
+        status = "ok"
+        if delta > args.threshold:
+            status = "REGRESSION"
+            regressions.append((name, delta))
+        elif delta < -args.threshold:
+            status = "improved"
+        rows.append((name, base_ms, cur_ms, delta, status))
+
+    for name in base:
+        if name not in cur:
+            rows.append((name, base[name]["mean_ms"], None, None, "MISSING"))
+
+    # Console report
+    print(f"Baseline commit: {base_meta.get('commit', '?')}")
+    print(f"Current  commit: {cur_meta.get('commit', '?')}")
+    print(f"Threshold: ±{args.threshold:.1f}%\n")
+    header = f"{'benchmark':<36} {'baseline':>12} {'current':>12} {'delta':>9}  status"
+    print(header)
+    print("-" * len(header))
+    for name, base_ms, cur_ms, delta, status in rows:
+        b = f"{base_ms:.4f} ms" if base_ms is not None else "—"
+        c = f"{cur_ms:.4f} ms" if cur_ms is not None else "—"
+        d = f"{delta:+.2f}%" if delta is not None else "—"
+        print(f"{name:<36} {b:>12} {c:>12} {d:>9}  {status}")
+
+    # GitHub step summary (Markdown)
+    step_summary = os.environ.get("GITHUB_STEP_SUMMARY")
+    if step_summary:
+        with open(step_summary, "a") as f:
+            f.write("## Benchmark vs baseline\n\n")
+            f.write(f"Threshold: ±{args.threshold:.1f}% &middot; "
+                    f"baseline `{base_meta.get('commit', '?')[:12]}`\n\n")
+            f.write("| benchmark | baseline | current | delta | status |\n")
+            f.write("|---|--:|--:|--:|:--|\n")
+            for name, base_ms, cur_ms, delta, status in rows:
+                b = f"{base_ms:.4f} ms" if base_ms is not None else "—"
+                c = f"{cur_ms:.4f} ms" if cur_ms is not None else "—"
+                d = f"{delta:+.2f}%" if delta is not None else "—"
+                emoji = {"REGRESSION": "🔴", "improved": "🟢", "ok": "⚪",
+                         "NEW": "🆕", "MISSING": "⚠️"}.get(status, "")
+                f.write(f"| {name} | {b} | {c} | {d} | {emoji} {status} |\n")
+
+    if regressions:
+        print(f"\n{len(regressions)} benchmark(s) regressed beyond ±{args.threshold:.1f}%:")
+        for name, delta in regressions:
+            print(f"  - {name}: {delta:+.2f}%")
+        return 1
+    print("\nNo regressions beyond threshold.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())