From f9e92e81f9cb9a0acac8c0eca614f1768f1e1907 Mon Sep 17 00:00:00 2001
From: Abmcar <abmcar@qq.com>
Date: Sat, 16 May 2026 18:47:57 +0800
Subject: [PATCH 1/2] ci: speed up evmone perf regression checks

---
 .ci/run_test_suite.sh                   |  40 +++-
 .github/workflows/dtvm_evm_test_x86.yml |  33 ++-
 tools/check_performance_regression.py   | 296 +++++++++++++++++++++---
 3 files changed, 327 insertions(+), 42 deletions(-)

diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh
index dd253b0c4..bb03e1c70 100644
--- a/.ci/run_test_suite.sh
+++ b/.ci/run_test_suite.sh
@@ -305,16 +305,34 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do
             ;;
         "benchmarksuite")
             # Clone evmone and run performance regression check
-            EVMONE_DIR="evmone"
-            if [ ! -d "$EVMONE_DIR" ]; then
-                git clone --depth 1 --recurse-submodules -b for_test https://github.com/DTVMStack/evmone.git $EVMONE_DIR
+            EVMONE_DIR=${EVMONE_DIR:-evmone}
+            EVMONE_REPO=${EVMONE_REPO:-https://github.com/DTVMStack/evmone.git}
+            EVMONE_REF=${EVMONE_REF:-for_test}
+            EVMONE_COMMIT=${EVMONE_COMMIT:-}
+
+            if [ -z "$EVMONE_COMMIT" ]; then  # no pinned commit supplied: resolve the branch tip
+                EVMONE_COMMIT=$(git ls-remote "$EVMONE_REPO" "refs/heads/$EVMONE_REF" | awk '{print $1}')
+            fi
+            if [ -z "$EVMONE_COMMIT" ]; then
+                echo "Unable to resolve $EVMONE_REPO refs/heads/$EVMONE_REF" >&2  # diagnostics go to stderr
+                exit 1
             fi
 
+            if [ ! -d "$EVMONE_DIR/.git" ]; then
+                rm -rf "$EVMONE_DIR"
+                git clone --depth 1 --recurse-submodules -b "$EVMONE_REF" "$EVMONE_REPO" "$EVMONE_DIR"
+            fi
+            git -C "$EVMONE_DIR" remote set-url origin "$EVMONE_REPO"
+            git -C "$EVMONE_DIR" fetch --depth 1 origin "$EVMONE_REF"
+            git -C "$EVMONE_DIR" checkout --detach "$EVMONE_COMMIT"
+            git -C "$EVMONE_DIR" submodule update --init --recursive
+
             BENCHMARK_THRESHOLD=${BENCHMARK_THRESHOLD:-0.15}
             BENCHMARK_MODE=${BENCHMARK_MODE:-multipass}
             BENCHMARK_SUMMARY_FILE=${BENCHMARK_SUMMARY_FILE:-/tmp/perf_summary.md}
             BENCHMARK_REPETITIONS=${BENCHMARK_REPETITIONS:-3}
             BENCHMARK_MIN_TIME=${BENCHMARK_MIN_TIME:-""}
+            BENCHMARK_JOBS=${BENCHMARK_JOBS:-1}
 
             PERF_ARGS=""
             if [ -n "$BENCHMARK_REPETITIONS" ]; then
@@ -323,6 +341,9 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do
             if [ -n "$BENCHMARK_MIN_TIME" ]; then
                 PERF_ARGS="$PERF_ARGS --benchmark-min-time $BENCHMARK_MIN_TIME"
             fi
+            if [ -n "$BENCHMARK_JOBS" ]; then
+                PERF_ARGS="$PERF_ARGS --benchmark-jobs $BENCHMARK_JOBS"
+            fi
 
             cp build/lib/* $EVMONE_DIR/
 
@@ -331,7 +352,18 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do
             cp ../tools/check_performance_regression.py ./
 
             if [ ! -f "build/bin/evmone-bench" ]; then
-                "$SCRIPT_DIR/cmake_ci_build.sh" build -- -DEVMONE_TESTING=ON -DCMAKE_BUILD_TYPE=Release
+                EVMONE_CMAKE_ARGS=(-DEVMONE_TESTING=ON -DCMAKE_BUILD_TYPE=Release)
+                if [ "${DTVM_CI_USE_NINJA:-0}" = "1" ]; then
+                    EVMONE_CMAKE_ARGS=(-G Ninja "${EVMONE_CMAKE_ARGS[@]}")
+                fi
+                if [ "${DTVM_CI_USE_SCCACHE:-0}" = "1" ]; then
+                    EVMONE_CMAKE_ARGS+=(
+                        -DCMAKE_C_COMPILER_LAUNCHER=sccache
+                        -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+                    )
+                fi
+                cmake -S . -B build "${EVMONE_CMAKE_ARGS[@]}"
+                cmake --build build --target evmone-bench --parallel "${DTVM_CI_BUILD_JOBS:-16}"
             fi
             print_sccache_stats "before benchmarks"
 
diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml
index b2815fbd0..d85dddb7f 100644
--- a/.github/workflows/dtvm_evm_test_x86.yml
+++ b/.github/workflows/dtvm_evm_test_x86.yml
@@ -16,6 +16,10 @@ on:
 permissions:
     contents: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 # Shared FetchContent cache root for all container jobs. The hook in
 # CMakeLists.txt (commit 96707a2 lines 8-18) picks this up as the base
 # dir for FetchContent populations. The cache key includes the generator
@@ -28,6 +32,9 @@ env:
   SCCACHE_IDLE_TIMEOUT: "0"
   DTVM_CI_USE_NINJA: "1"
   DTVM_CI_USE_SCCACHE: "1"
+  EVMONE_REPO: https://github.com/DTVMStack/evmone.git
+  EVMONE_REF: for_test
+  EVMONE_BENCH_CACHE_VERSION: v1
 
 jobs:
   build_test_evm_interpreter_x86_ctest:
@@ -500,6 +507,23 @@ jobs:
           git checkout ${{ github.sha }}
           git stash pop || true
 
+      - name: Resolve evmone benchmark commit
+        id: evmone
+        run: |
+          EVMONE_COMMIT=$(git ls-remote "$EVMONE_REPO" "refs/heads/$EVMONE_REF" | awk '{print $1}')
+          if [ -z "$EVMONE_COMMIT" ]; then
+            echo "Unable to resolve $EVMONE_REPO refs/heads/$EVMONE_REF" >&2
+            exit 1
+          fi
+          echo "commit=$EVMONE_COMMIT" >> "$GITHUB_OUTPUT"
+          echo "Resolved evmone $EVMONE_REF to $EVMONE_COMMIT"
+
+      - name: Cache evmone-bench
+        uses: actions/cache@v4
+        with:
+          path: evmone
+          key: ${{ runner.os }}-evmone-bench-${{ env.EVMONE_BENCH_CACHE_VERSION }}-${{ steps.evmone.outputs.commit }}-${{ hashFiles('.ci/run_test_suite.sh', '.github/workflows/dtvm_evm_test_x86.yml') }}
+
       - name: Build current PR and check regression
         id: perf-check
         run: |
@@ -509,7 +533,7 @@ jobs:
           export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm
           export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH
 
-          rm -rf build evmone
+          rm -rf build
 
           export CMAKE_BUILD_TARGET=Release
           export ENABLE_ASAN=false
@@ -523,10 +547,17 @@ jobs:
           export BENCHMARK_BASELINE_LIB=/tmp/baseline_lib
           export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary_${{ matrix.mode }}.md
           export BENCHMARK_REPETITIONS=5
+          export BENCHMARK_JOBS=3
+          export EVMONE_COMMIT=${{ steps.evmone.outputs.commit }}
 
           bash .ci/run_test_suite.sh
         continue-on-error: true
 
+      - name: Clean evmone cache inputs
+        if: always()
+        run: |
+          rm -f evmone/libdtvmapi.so evmone/libdtvmapi.so.* evmone/check_performance_regression.py
+
       - name: Write Performance Summary
         if: always()
         run: |
diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py
index 48d2d4e02..96a779a06 100755
--- a/tools/check_performance_regression.py
+++ b/tools/check_performance_regression.py
@@ -21,10 +21,12 @@
 import argparse
 import json
 import os
+import re
 import shutil
 import subprocess
 import sys
 import tempfile
+from collections import Counter
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 
@@ -37,33 +39,43 @@ class BenchmarkResult:
     iterations: int
 
 
-def run_benchmark(
-    lib_path: str,
-    mode: str,
-    benchmark_dir: str,
-    extra_args: Optional[List[str]] = None,
-    repetitions: int = 3,
-    benchmark_min_time: Optional[str] = None,
-) -> List[BenchmarkResult]:
-    """Run benchmark and parse JSON output.
+def benchmark_env(lib_path: str, mode: str) -> Dict[str, str]:  # copy of os.environ plus EVMONE_EXTERNAL_OPTIONS="<lib_path>,mode=<mode>"
+    return {**os.environ, "EVMONE_EXTERNAL_OPTIONS": f"{lib_path},mode={mode}"}
 
-    Uses --benchmark_out to write JSON results to a temporary file so that
-    the human-readable benchmark progress streams to stdout/stderr in real
-    time (important for CI visibility).
 
-    When *repetitions* > 1, each benchmark runs N times and only the
-    median aggregate is kept, significantly reducing noise from ASLR and
-    shared-runner contention.
-    """
-    env = {"EVMONE_EXTERNAL_OPTIONS": f"{lib_path},mode={mode}"}
+def available_cpus() -> List[int]:  # CPU ids usable by this process, ascending
+    if hasattr(os, "sched_getaffinity"):
+        return sorted(os.sched_getaffinity(0))  # affinity set of the calling process (pid 0 = self)
+    cpu_count = os.cpu_count() or 1  # no affinity API on this platform; cpu_count() may return None
+    return list(range(cpu_count))
 
-    fd, json_out_path = tempfile.mkstemp(suffix=".json")
-    os.close(fd)
 
+def split_benchmark_args(extra_args: Optional[List[str]]) -> Tuple[str, List[str]]:  # -> (filter, all other args in order)
+    benchmark_filter = "external/total/*"  # default when no --benchmark_filter=... is supplied
+    remaining_args: List[str] = []
+
+    for arg in extra_args or []:  # None is treated as "no extra args"
+        if arg.startswith("--benchmark_filter="):
+            benchmark_filter = arg.split("=", 1)[1]  # split once so '=' inside the pattern survives; last flag wins
+        else:
+            remaining_args.append(arg)
+
+    return benchmark_filter, remaining_args
+
+
+def build_benchmark_cmd(
+    benchmark_dir: str,
+    json_out_path: str,
+    benchmark_filter: str,
+    repetitions: int,
+    benchmark_min_time: Optional[str],
+    extra_args: List[str],
+    cpu: Optional[int],
+) -> List[str]:
     cmd: List[str] = []
 
-    if shutil.which("taskset"):
-        cmd.extend(["taskset", "-c", "0"])
+    if cpu is not None and shutil.which("taskset"):
+        cmd.extend(["taskset", "-c", str(cpu)])
 
     cmd.extend([
         "./build/bin/evmone-bench",
@@ -80,39 +92,242 @@ def run_benchmark(
     if benchmark_min_time:
         cmd.append(f"--benchmark_min_time={benchmark_min_time}")
 
-    if extra_args:
-        cmd.extend(extra_args)
+    cmd.extend(extra_args)
+    cmd.append(f"--benchmark_filter={benchmark_filter}")
+    return cmd
 
-    if not any(arg.startswith("--benchmark_filter") for arg in cmd):
-        cmd.append("--benchmark_filter=external/total/*")
 
-    print(f"Running: {' '.join(cmd)}")
-    print(f"Environment: EVMONE_EXTERNAL_OPTIONS={env['EVMONE_EXTERNAL_OPTIONS']}")
+def list_benchmark_names(
+    lib_path: str,
+    mode: str,
+    benchmark_dir: str,
+    benchmark_filter: str,
+    extra_args: List[str],
+) -> List[str]:
+    cmd = [
+        "./build/bin/evmone-bench",
+        benchmark_dir,
+        "--benchmark_list_tests",
+        f"--benchmark_filter={benchmark_filter}",
+    ]
+    cmd.extend(extra_args)
+
+    print(f"Listing benchmarks: {' '.join(cmd)}")
     sys.stdout.flush()
 
     result = subprocess.run(
         cmd,
-        env={**subprocess.os.environ, **env},
+        env=benchmark_env(lib_path, mode),
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
     )
-
     if result.returncode != 0:
-        print(f"Benchmark execution failed with code {result.returncode}")
-        try:
-            os.unlink(json_out_path)
-        except OSError:
-            pass
+        print(result.stdout, end="")
+        print(f"Benchmark listing failed with code {result.returncode}")
+        sys.exit(2)
+
+    names: List[str] = []
+    for line in result.stdout.splitlines():
+        name = line.strip()
+        if not name or any(ch.isspace() for ch in name):
+            continue
+        names.append(name)
+
+    if not names:
+        print(f"::error::No benchmarks matched filter: {benchmark_filter}")
         sys.exit(2)
 
+    return names
+
+
+def exact_filter(names: List[str]) -> str:  # anchored alternation matching exactly these names; escapes only POSIX ERE metacharacters
+    return "^(" + "|".join(re.sub(r"([.^$*+?()\[\]{}|\\])", r"\\\1", name) for name in names) + ")$"
+
+
+def parse_benchmark_json_file(json_out_path: str) -> List[BenchmarkResult]:  # read the whole file and hand its text to parse_benchmark_json
+    with open(json_out_path, "r") as f:
+        json_data = f.read()
+    return parse_benchmark_json(json_data)
+
+
+def run_benchmark_shard(
+    lib_path: str,
+    mode: str,
+    benchmark_dir: str,
+    benchmark_filter: str,
+    extra_args: List[str],
+    repetitions: int,
+    benchmark_min_time: Optional[str],
+    cpu: Optional[int],
+) -> List[BenchmarkResult]:
+    fd, json_out_path = tempfile.mkstemp(suffix=".json")
+    os.close(fd)
+
     try:
-        with open(json_out_path, "r") as f:
-            json_data = f.read()
+        cmd = build_benchmark_cmd(
+            benchmark_dir,
+            json_out_path,
+            benchmark_filter,
+            repetitions,
+            benchmark_min_time,
+            extra_args,
+            cpu,
+        )
+
+        print(f"Running: {' '.join(cmd)}")
+        print(f"Environment: EVMONE_EXTERNAL_OPTIONS={lib_path},mode={mode}")
+        sys.stdout.flush()
+
+        result = subprocess.run(cmd, env=benchmark_env(lib_path, mode))
+
+        if result.returncode != 0:
+            print(f"Benchmark execution failed with code {result.returncode}")
+            sys.exit(2)
+
+        return parse_benchmark_json_file(json_out_path)
     finally:
         try:
             os.unlink(json_out_path)
         except OSError:
             pass
 
-    return parse_benchmark_json(json_data)
+
+def run_benchmark_parallel(  # run the filtered benchmarks, sharded over up to benchmark_jobs child processes
+    lib_path: str,
+    mode: str,
+    benchmark_dir: str,
+    extra_args: Optional[List[str]],
+    repetitions: int,
+    benchmark_min_time: Optional[str],
+    benchmark_jobs: int,
+) -> List[BenchmarkResult]:
+    benchmark_filter, remaining_args = split_benchmark_args(extra_args)
+
+    if benchmark_jobs <= 1:  # single process: no listing or sharding, pinned to the first available CPU
+        cpus = available_cpus()
+        cpu = cpus[0] if cpus else None
+        return run_benchmark_shard(
+            lib_path,
+            mode,
+            benchmark_dir,
+            benchmark_filter,
+            remaining_args,
+            repetitions,
+            benchmark_min_time,
+            cpu,
+        )
+
+    names = list_benchmark_names(
+        lib_path, mode, benchmark_dir, benchmark_filter, remaining_args
+    )
+    cpus = available_cpus()
+    jobs = min(benchmark_jobs, len(names), len(cpus) if cpus else benchmark_jobs)  # never more shards than benchmarks or CPUs
+
+    if jobs < benchmark_jobs:
+        print(f"::notice::Reduced benchmark jobs from {benchmark_jobs} to {jobs}")
+
+    shards = [names[i::jobs] for i in range(jobs)]  # round-robin split of the listed names
+    temp_paths: List[str] = []  # one JSON output file per shard, removed in the finally block
+    processes: List[subprocess.Popen] = []
+
+    try:
+        for index, shard_names in enumerate(shards):
+            fd, json_out_path = tempfile.mkstemp(suffix=".json")
+            os.close(fd)  # only the path is needed; the child writes the file itself
+            temp_paths.append(json_out_path)
+
+            cpu = cpus[index % len(cpus)] if cpus else None
+            cmd = build_benchmark_cmd(
+                benchmark_dir,
+                json_out_path,
+                exact_filter(shard_names),
+                repetitions,
+                benchmark_min_time,
+                remaining_args,
+                cpu,
+            )
+
+            cpu_msg = f" on CPU {cpu}" if cpu is not None else ""
+            print(
+                f"Starting benchmark shard {index + 1}/{jobs}"
+                f" ({len(shard_names)} benchmarks){cpu_msg}: {' '.join(cmd)}"
+            )
+            sys.stdout.flush()  # flush before the child starts writing to the shared stdout
+
+            processes.append(
+                subprocess.Popen(cmd, env=benchmark_env(lib_path, mode))
+            )
+
+        failures = []  # (1-based shard number, exit code) for every shard that failed
+        for index, process in enumerate(processes):
+            return_code = process.wait()  # wait for every shard so all failures are reported together
+            if return_code != 0:
+                failures.append((index + 1, return_code))
+
+        if failures:
+            for shard, return_code in failures:
+                print(
+                    f"Benchmark shard {shard} failed with code {return_code}"
+                )
+            sys.exit(2)
+
+        results: List[BenchmarkResult] = []
+        for json_out_path in temp_paths:
+            results.extend(parse_benchmark_json_file(json_out_path))
+
+        name_counts = Counter(result.name for result in results)
+        duplicate_names = sorted(
+            name for name, count in name_counts.items() if count > 1
+        )
+        if duplicate_names:  # a name reported more than once across shards is treated as fatal
+            print(f"::error::Duplicate benchmark results: {duplicate_names}")
+            sys.exit(2)
+
+        return sorted(results, key=lambda result: result.name)
+    finally:
+        for process in processes:
+            if process.poll() is None:
+                process.terminate()
+        for json_out_path in temp_paths:
+            try:
+                os.unlink(json_out_path)
+            except OSError:
+                pass
+
+
+def run_benchmark(
+    lib_path: str,
+    mode: str,
+    benchmark_dir: str,
+    extra_args: Optional[List[str]] = None,
+    repetitions: int = 3,
+    benchmark_min_time: Optional[str] = None,
+    benchmark_jobs: int = 1,
+) -> List[BenchmarkResult]:
+    """Run benchmark and parse JSON output.
+
+    Uses --benchmark_out to write JSON results to a temporary file so that
+    the human-readable benchmark progress streams to stdout/stderr in real
+    time (important for CI visibility).
+
+    When *repetitions* > 1, each benchmark runs N times and only the median
+    aggregate is kept, reducing noise from ASLR and shared-runner contention.
+    *benchmark_jobs* > 1 shards the run across processes; < 1 exits with 2.
+    """
+    if benchmark_jobs < 1:
+        print("::error::--benchmark-jobs must be >= 1")
+        sys.exit(2)
+
+    return run_benchmark_parallel(
+        lib_path,
+        mode,
+        benchmark_dir,
+        extra_args,
+        repetitions,
+        benchmark_min_time,
+        benchmark_jobs,
+    )
 
 
 def parse_benchmark_json(json_output: str) -> List[BenchmarkResult]:
@@ -483,6 +698,12 @@ def main():
         default=None,
         help="Minimum execution time per benchmark (e.g., '1s' or '2.0'). Forwarded to evmone-bench.",
     )
+    parser.add_argument(
+        "--benchmark-jobs",
+        type=int,
+        default=1,
+        help="Run benchmark shards in parallel across CPUs (default: 1).",
+    )
 
     args = parser.parse_args()
 
@@ -502,6 +723,7 @@ def main():
             extra_args=bench_extra,
             repetitions=args.benchmark_repetitions,
             benchmark_min_time=args.benchmark_min_time,
+            benchmark_jobs=args.benchmark_jobs,
         )
     except Exception as e:
         print(f"::error::Failed to run benchmarks: {e}")

From 1f7965ba3b695f1916033e867588a3b19746002b Mon Sep 17 00:00:00 2001
From: Abmcar <abmcar@qq.com>
Date: Sat, 16 May 2026 22:11:59 +0800
Subject: [PATCH 2/2] ci: address evmone benchmark review comments

---
 .ci/run_test_suite.sh                 | 19 +++++++++---
 tools/check_performance_regression.py | 42 +++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh
index bb03e1c70..a44ac2638 100644
--- a/.ci/run_test_suite.sh
+++ b/.ci/run_test_suite.sh
@@ -322,10 +322,21 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do
                 rm -rf "$EVMONE_DIR"
                 git clone --depth 1 --recurse-submodules -b "$EVMONE_REF" "$EVMONE_REPO" "$EVMONE_DIR"
             fi
-            git -C "$EVMONE_DIR" remote set-url origin "$EVMONE_REPO"
-            git -C "$EVMONE_DIR" fetch --depth 1 origin "$EVMONE_REF"
-            git -C "$EVMONE_DIR" checkout --detach "$EVMONE_COMMIT"
-            git -C "$EVMONE_DIR" submodule update --init --recursive
+
+            EVMONE_HEAD=$(git -C "$EVMONE_DIR" rev-parse HEAD 2>/dev/null || true)  # empty if HEAD cannot be resolved
+            EVMONE_SUBMODULES_READY=1
+            if git -C "$EVMONE_DIR" submodule status --recursive | grep -q '^[-+]'; then  # '-' uninitialized, '+' wrong commit
+                EVMONE_SUBMODULES_READY=0
+            fi
+
+            if [ "$EVMONE_HEAD" = "$EVMONE_COMMIT" ] && [ "$EVMONE_SUBMODULES_READY" = 1 ]; then
+                echo "Using cached evmone at $EVMONE_COMMIT"
+            else
+                git -C "$EVMONE_DIR" remote set-url origin "$EVMONE_REPO"
+                git -C "$EVMONE_DIR" fetch --depth 1 origin "$EVMONE_COMMIT" || git -C "$EVMONE_DIR" fetch --depth 1 origin "$EVMONE_REF"  # pinned SHA may not be the branch tip
+                git -C "$EVMONE_DIR" checkout --detach "$EVMONE_COMMIT"
+                git -C "$EVMONE_DIR" submodule update --init --recursive
+            fi
 
             BENCHMARK_THRESHOLD=${BENCHMARK_THRESHOLD:-0.15}
             BENCHMARK_MODE=${BENCHMARK_MODE:-multipass}
diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py
index 96a779a06..ae96eeba7 100755
--- a/tools/check_performance_regression.py
+++ b/tools/check_performance_regression.py
@@ -26,6 +26,7 @@
 import subprocess
 import sys
 import tempfile
+import time
 from collections import Counter
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
@@ -151,6 +152,43 @@ def parse_benchmark_json_file(json_out_path: str) -> List[BenchmarkResult]:
     return parse_benchmark_json(json_data)
 
 
+def stop_processes(processes: List[subprocess.Popen], timeout: float = 5.0) -> None:  # terminate, then kill, any children still running
+    running = [process for process in processes if process.poll() is None]
+    if not running:
+        return
+
+    for process in running:
+        try:
+            process.terminate()  # polite request first
+        except OSError:
+            pass  # best effort; anything still alive is killed below
+
+    deadline = time.monotonic() + timeout  # one shared budget for all processes, not per process
+    for process in running:
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        try:
+            process.wait(timeout=remaining)
+        except subprocess.TimeoutExpired:
+            pass
+
+    still_running = [process for process in running if process.poll() is None]
+    for process in still_running:
+        try:
+            process.kill()  # escalate for anything that outlived terminate()
+        except OSError:
+            pass
+
+    for process in still_running:
+        try:
+            process.wait(timeout=1.0)  # reap the killed child
+        except subprocess.TimeoutExpired:
+            print(
+                f"::warning::Benchmark shard process {process.pid} did not exit after kill"
+            )
+
+
 def run_benchmark_shard(
     lib_path: str,
     mode: str,
@@ -286,9 +324,7 @@ def run_benchmark_parallel(
 
         return sorted(results, key=lambda result: result.name)
     finally:
-        for process in processes:
-            if process.poll() is None:
-                process.terminate()
+        stop_processes(processes)
         for json_out_path in temp_paths:
             try:
                 os.unlink(json_out_path)