From 20941e1df1786ad156ef6beb76682dae07ea69ef Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Sat, 23 May 2026 11:02:46 +0700
Subject: [PATCH 1/9] feat(tooling): detect gas-checking tests via runtime
 taint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a pytest plugin enabled with `--detect-gas-checks` (fill mode only)
that identifies tests asserting specific gas values. The plugin uses two
detection modes, chosen per sink:

- **Storage in post** (`post[addr].storage[slot]`) — taint propagation.
  Wraps `Bytecode.gas_cost`, per-fork `gas_costs` /
  `opcode_gas_calculator` / `transaction_intrinsic_cost_calculator`, and
  patches `Number.__new__` + `FixedSizeHexNumber.__new__` so a
  `GasTainted(int)` carrier survives `HashInt` / `HexNumber`
  construction. Tests whose post-storage holds a tainted value are
  flagged.
- **Receipt / header / block-level / benchmark fields** — field-presence.
  The field name (`cumulative_gas_used`, `expected_gas_used`,
  `blob_gas_used`, etc.) is itself the assertion signal; if set, flag.

Tests with `tx.error`, `block_exception`, or the `exception_test` marker
are excluded so OOG-expecting tests don't pollute the output. Worker
results are aggregated to the master via `workeroutput` /
`pytest_testnodedown`.

`Block.expected_gas_used` and `expected_benchmark_gas_used` (on both
`BaseTest` and `BenchmarkTest`) change from `int | None` to
`HexNumber | None`. These fields are internal, assertion-only, and not
serialised to fixtures, so the type change has no fixture impact and
brings them under the same constructor path as the other gas sinks.
The benchmark validator widens a local variable's type to accommodate
the resulting `HexNumber | int` union.

Also adds `scripts/mark_tests.py`, an idempotent script that applies a
configurable `@pytest.mark.<name>` decorator to test functions listed in
a JSON file. Accepts three input shapes (bare mapping, bare list,
wrapped with embedded marker name); `--marker` overrides any embedded
value.
---
 .../pytest_commands/plugins/filler/filler.py  |  12 +
 .../plugins/filler/gas_taint.py               | 489 ++++++++++++++++++
 .../pytest_ini_files/pytest-fill.ini          |   1 +
 .../src/execution_testing/specs/base.py       |   8 +-
 .../src/execution_testing/specs/benchmark.py  |   2 +-
 .../src/execution_testing/specs/blockchain.py |   2 +-
 scripts/mark_tests.py                         | 366 +++++++++++++
 7 files changed, 875 insertions(+), 5 deletions(-)
 create mode 100644 packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
 create mode 100644 scripts/mark_tests.py
diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/filler.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/filler.py
index 5a733c09ad5..9a48c5191a7 100644
--- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/filler.py
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/filler.py
@@ -1700,6 +1700,18 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
 
                 super(BaseTestWrapper, self).__init__(*args, **kwargs)
 
+                if getattr(request.config, "_gas_taint_enabled", False):
+                    from .gas_taint import collect_taint_hits
+
+                    hits = collect_taint_hits(self, request.node)
+                    if hits:
+                        request.config._gas_taint_results[  # type: ignore[attr-defined]
+                            request.node.nodeid
+                        ] = hits
+                        request.node.user_properties.append(
+                            ("gas_taint_hits", hits)
+                        )
+
                 # Get the filling session from config
                 session: FillingSession = request.config.filling_session  # type: ignore
                 assert isinstance(session, FillingSession)
diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
new file mode 100644
index 00000000000..050ba574ab2
--- /dev/null
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
@@ -0,0 +1,489 @@
+"""
+Pytest plugin that detects tests which positively assert specific gas values.
+
+Enabled with ``--detect-gas-checks``. After each test body constructs its
+``StateTest`` / ``BlockchainTest`` (but before the t8n runs), the walker
+inspects two categories of sinks:
+
+1. **Field-name implies gas assertion** — checked by presence (``is not
+   None``). No taint needed because the field name itself is the signal:
+
+   - ``tx.expected_receipt.{cumulative_gas_used, gas_used, blob_gas_used}``
+   - ``block.header_verify.{gas_used, blob_gas_used}``
+   - ``block.expected_gas_used``
+   - ``self.expected_benchmark_gas_used``
+
+2. **Storage slots in ``post``** — checked via taint propagation, since
+   a storage slot can hold any value (counter, identifier, etc.) and the
+   field alone doesn't tell us whether it's gas-related.
+
+For the storage sink the plugin wraps gas-source functions
+(``Bytecode.gas_cost``, ``fork.gas_costs()``, ``opcode_gas_calculator``,
+``transaction_intrinsic_cost_calculator``) so they return a ``GasTainted``
+int subclass carrying provenance, and patches ``Number.__new__`` /
+``FixedSizeHexNumber.__new__`` so taint survives construction of
+``HashInt`` / ``HexNumber`` / ``ZeroPaddedHexNumber``.
+
+Tests with any sink hit get a ``gas_check`` marker attached and an entry
+written to the JSON report. Tests that expect a transaction or block
+exception (OOG-style) are skipped — their gas value lives on
+``tx.gas_limit`` (an input, not a sink) and would never appear here.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import fields, replace
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Set, Tuple
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Taint carrier
+# ---------------------------------------------------------------------------
+
+
+class GasTainted(int):
+    """An int that remembers it was computed from a gas-cost source."""
+
+    origins: Tuple[str, ...]
+
+    def __new__(
+        cls, value: int, origins: Tuple[str, ...] = ()
+    ) -> "GasTainted":
+        """Build a tainted int with attached provenance tuple."""
+        inst = super().__new__(cls, int(value))
+        inst.origins = origins or ("?",)
+        return inst
+
+    def _merge(self, other: Any) -> Tuple[str, ...]:
+        o = getattr(other, "origins", ())
+        return tuple(dict.fromkeys((*self.origins, *o)))
+
+    def _propagate(self, raw: Any, other: Any) -> Any:
+        if raw is NotImplemented:
+            return NotImplemented
+        return GasTainted(raw, self._merge(other))
+
+    def __add__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__add__(self, o), o)
+
+    def __radd__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__radd__(self, o), o)
+
+    def __sub__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__sub__(self, o), o)
+
+    def __rsub__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__rsub__(self, o), o)
+
+    def __mul__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__mul__(self, o), o)
+
+    def __rmul__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__rmul__(self, o), o)
+
+    def __floordiv__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__floordiv__(self, o), o)
+
+    def __rfloordiv__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__rfloordiv__(self, o), o)
+
+    def __mod__(self, o: Any) -> Any:  # noqa: D105
+        return self._propagate(int.__mod__(self, o), o)
+
+    def __neg__(self) -> "GasTainted":  # noqa: D105
+        return GasTainted(int.__neg__(self), self.origins)
+
+
+def _origins_of(value: Any) -> Tuple[str, ...] | None:
+    return getattr(value, "origins", None)
+
+
+def _is_tainted(value: Any) -> bool:
+    return _origins_of(value) is not None
+
+
+# ---------------------------------------------------------------------------
+# Source instrumentation
+# ---------------------------------------------------------------------------
+
+_installed = False
+_originals: List[Callable[[], None]] = []
+
+
+def _all_subclasses(cls: type) -> Set[type]:
+    out: Set[type] = set()
+    for sub in cls.__subclasses__():
+        out.add(sub)
+        out.update(_all_subclasses(sub))
+    return out
+
+
+def _patch_classmethod_if_local(
+    cls: type, name: str, wrap: Callable[[type, Callable, tuple, dict], Any]
+) -> None:
+    """Wrap ``cls.name`` only if defined on ``cls`` itself (not inherited)."""
+    if name not in cls.__dict__:
+        return
+    original_cm = cls.__dict__[name]
+    if not isinstance(original_cm, classmethod):
+        return
+    original_fn = original_cm.__func__
+
+    def wrapped(c: type, *args: Any, **kwargs: Any) -> Any:
+        return wrap(c, original_fn, args, kwargs)
+
+    setattr(cls, name, classmethod(wrapped))
+
+    def revert() -> None:
+        setattr(cls, name, original_cm)
+
+    _originals.append(revert)
+
+
+def _wrap_gas_costs(c: type, fn: Callable, args: tuple, kwargs: dict) -> Any:
+    """Taint the plain-int fields of ``fn``'s dataclass; bools are kept."""
+    gc = fn(c, *args, **kwargs)
+    patched: Dict[str, Any] = {}
+    for name, v in ((f.name, getattr(gc, f.name)) for f in fields(gc)):
+        if isinstance(v, int) and not (isinstance(v, bool) or _is_tainted(v)):
+            patched[name] = GasTainted(v, (f"gas_costs.{name}",))
+    return replace(gc, **patched) if patched else gc
+
+
+def _wrap_intrinsic(c: type, fn: Callable, args: tuple, kwargs: dict) -> Any:
+    inner = fn(c, *args, **kwargs)
+
+    def calc(*a: Any, **kw: Any) -> Any:
+        result = inner(*a, **kw)
+        if isinstance(result, int) and not _is_tainted(result):
+            return GasTainted(result, ("intrinsic_gas",))
+        return result
+
+    return calc
+
+
+def _wrap_opcode_calc(c: type, fn: Callable, args: tuple, kwargs: dict) -> Any:
+    inner = fn(c, *args, **kwargs)
+
+    def calc(opcode: Any) -> Any:
+        result = inner(opcode)
+        if isinstance(result, int) and not _is_tainted(result):
+            name = getattr(opcode, "_name_", None) or str(opcode)
+            return GasTainted(result, (f"opcode_gas[{name}]",))
+        return result
+
+    return calc
+
+
+def install_taint() -> None:
+    """Install all gas-source patches. Idempotent."""
+    global _installed
+    if _installed:
+        return
+    _installed = True
+
+    # Bytecode.gas_cost - taint the aggregate result
+    from execution_testing.vm.bytecode import Bytecode
+
+    original_bc = Bytecode.gas_cost
+
+    def gas_cost(self: Any, fork: Any) -> Any:
+        result = original_bc(self, fork)
+        if isinstance(result, int) and not _is_tainted(result):
+            return GasTainted(result, ("Bytecode.gas_cost",))
+        return result
+
+    Bytecode.gas_cost = gas_cost  # type: ignore[method-assign]
+    _originals.append(lambda: setattr(Bytecode, "gas_cost", original_bc))
+
+    # Concrete forks: walk subclasses and wrap any locally-defined
+    # gas_costs / transaction_intrinsic_cost_calculator / opcode_gas_calculator
+    from execution_testing.forks.base_fork import BaseFork
+
+    for cls in _all_subclasses(BaseFork):
+        _patch_classmethod_if_local(cls, "gas_costs", _wrap_gas_costs)
+        _patch_classmethod_if_local(
+            cls, "transaction_intrinsic_cost_calculator", _wrap_intrinsic
+        )
+        _patch_classmethod_if_local(
+            cls, "opcode_gas_calculator", _wrap_opcode_calc
+        )
+
+    # Number.__new__ / FixedSizeHexNumber.__new__ - preserve origins through
+    # HexNumber, ZeroPaddedHexNumber, HashInt, etc. (two separate hierarchies).
+    from execution_testing.base_types.base_types import (
+        FixedSizeHexNumber,
+        Number,
+    )
+
+    def _wrap_new(target: type) -> None:
+        orig = target.__new__
+
+        def new(cls: type, input_number: Any) -> Any:
+            inst: Any = orig(cls, input_number)
+            origins = getattr(input_number, "origins", None)
+            if origins is not None:
+                try:
+                    inst.origins = origins
+                except (AttributeError, TypeError):
+                    pass
+            return inst
+
+        target.__new__ = new  # type: ignore[assignment,method-assign]
+
+        def revert() -> None:
+            target.__new__ = orig  # type: ignore[method-assign]
+
+        _originals.append(revert)
+
+    _wrap_new(Number)
+    _wrap_new(FixedSizeHexNumber)
+
+
+def uninstall_taint() -> None:
+    """Revert all patches in LIFO order. Used by tests of the plugin."""
+    global _installed
+    while _originals:
+        try:
+            _originals.pop()()
+        except Exception:
+            pass
+    _installed = False
+
+
+# ---------------------------------------------------------------------------
+# Sink walker
+# ---------------------------------------------------------------------------
+
+
+def _record_tainted(
+    hits: List[dict], kind: str, location: str, value: Any
+) -> None:
+    """Record only if value carries gas-source taint (used for storage)."""
+    origins = _origins_of(value)
+    if origins is None:
+        return
+    hits.append(
+        {
+            "kind": kind,
+            "location": location,
+            "value": int(value),
+            "origins": list(origins),
+        }
+    )
+
+
+def _record_present(
+    hits: List[dict], kind: str, location: str, value: Any
+) -> None:
+    """Record if value is set (field-name-implies-gas-assertion sinks)."""
+    if value is None:
+        return
+    hit: Dict[str, Any] = {
+        "kind": kind,
+        "location": location,
+        "value": int(value),
+    }
+    origins = _origins_of(value)
+    if origins is not None:
+        hit["origins"] = list(origins)
+    hits.append(hit)
+
+
+def _walk_storage(hits: List[dict], post: Any) -> None:
+    """Storage slots are general-purpose; only flag tainted values."""
+    if post is None:
+        return
+    try:
+        items = post.items()
+    except AttributeError:
+        return
+    for address, account in items:
+        if account is None:
+            continue
+        storage = getattr(account, "storage", None)
+        if storage is None:
+            continue
+        for slot, value in dict(storage.root).items():
+            _record_tainted(
+                hits,
+                "storage",
+                f"{address}:{int(slot)}",
+                value,
+            )
+
+
+def _walk_receipt(hits: List[dict], tx: Any) -> None:
+    """Receipt gas fields are self-identifying — flag on presence."""
+    if tx is None:
+        return
+    receipt = getattr(tx, "expected_receipt", None)
+    if receipt is None:
+        return
+    for field in ("cumulative_gas_used", "gas_used", "blob_gas_used"):
+        _record_present(hits, "receipt", field, getattr(receipt, field, None))
+
+
+def _walk_header_and_block(hits: List[dict], block: Any, i: int) -> None:
+    """Header gas fields and expected_gas_used are self-identifying."""
+    header_verify = getattr(block, "header_verify", None)
+    if header_verify is not None:
+        for field in ("gas_used", "blob_gas_used"):
+            _record_present(
+                hits,
+                "header",
+                f"block[{i}].{field}",
+                getattr(header_verify, field, None),
+            )
+    _record_present(
+        hits,
+        "block_expected_gas_used",
+        f"block[{i}].expected_gas_used",
+        getattr(block, "expected_gas_used", None),
+    )
+
+
+def _is_oog_test(test: Any, node: pytest.Item | None) -> bool:
+    tx = getattr(test, "tx", None)
+    if tx is not None and getattr(tx, "error", None) is not None:
+        return True
+    if getattr(test, "block_exception", None):
+        return True
+    if (
+        node is not None
+        and node.get_closest_marker("exception_test") is not None
+    ):
+        return True
+    return False
+
+
+def collect_taint_hits(test: Any, node: pytest.Item | None) -> List[dict]:
+    """Walk all known gas-assertion sinks on the test object."""
+    if _is_oog_test(test, node):
+        return []
+
+    hits: List[dict] = []
+    _walk_storage(hits, getattr(test, "post", None))
+    _walk_receipt(hits, getattr(test, "tx", None))
+
+    blocks = getattr(test, "blocks", None)
+    if blocks is not None:
+        for i, block in enumerate(blocks):
+            _walk_header_and_block(hits, block, i)
+
+    # expected_benchmark_gas_used is auto-defaulted on every test by the
+    # filler, so only treat it as a sink on actual BenchmarkTest instances.
+    if any(c.__name__ == "BenchmarkTest" for c in type(test).__mro__):
+        _record_present(
+            hits,
+            "benchmark",
+            "expected_benchmark_gas_used",
+            getattr(test, "expected_benchmark_gas_used", None),
+        )
+
+    return hits
+
+
+# ---------------------------------------------------------------------------
+# Pytest hooks
+# ---------------------------------------------------------------------------
+
+
+_RESULTS_ATTR = "_gas_taint_results"
+_ENABLED_ATTR = "_gas_taint_enabled"
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Add ``--detect-gas-checks`` / ``--gas-check-report`` options."""
+    group = parser.getgroup(
+        "gas-taint", "Detect tests that assert specific gas values"
+    )
+    group.addoption(
+        "--detect-gas-checks",
+        action="store_true",
+        dest="detect_gas_checks",
+        default=False,
+        help=(
+            "Instrument gas-source functions, propagate provenance, and "
+            "emit a JSON report listing tests whose post-state asserts a "
+            "gas-derived value."
+        ),
+    )
+    group.addoption(
+        "--gas-check-report",
+        action="store",
+        dest="gas_check_report",
+        default="gas_check_report.json",
+        help="Output path for the gas-check JSON report.",
+    )
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_configure(config: pytest.Config) -> None:
+    """Install taint and initialize result storage when enabled."""
+    config.addinivalue_line(
+        "markers",
+        "gas_check: test asserts a specific gas value (auto-applied).",
+    )
+    if not config.getoption("detect_gas_checks", default=False):
+        return
+    install_taint()
+    setattr(config, _ENABLED_ATTR, True)
+    setattr(config, _RESULTS_ATTR, {})
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> Any:
+    """Attach a ``gas_check`` marker if the item has taint hits."""
+    outcome = yield
+    if call.when != "call":
+        return outcome
+    for key, value in item.user_properties:
+        if key == "gas_taint_hits" and value:
+            item.add_marker(pytest.mark.gas_check)
+            break
+    return outcome
+
+
+def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
+    """Forward worker results to master, or write JSON on master."""
+    del exitstatus
+    config = session.config
+    if not getattr(config, _ENABLED_ATTR, False):
+        return
+    results = getattr(config, _RESULTS_ATTR, {})
+
+    try:
+        import xdist
+
+        is_worker = xdist.is_xdist_worker(session)
+    except ImportError:
+        is_worker = False
+
+    if is_worker:
+        # Send worker's results to master via workeroutput.
+        config.workeroutput["gas_taint_results"] = results  # type: ignore[attr-defined]
+        return
+
+    path = Path(config.getoption("gas_check_report"))
+    path.write_text(json.dumps(results, indent=2, sort_keys=True))
+
+
+def pytest_testnodedown(node: Any, error: Any) -> None:
+    """Aggregate worker results into the master's results dict."""
+    del error
+    config = node.config
+    if not getattr(config, _ENABLED_ATTR, False):
+        return
+    worker_results = getattr(node, "workeroutput", {}).get(
+        "gas_taint_results", {}
+    )
+    results = getattr(config, _RESULTS_ATTR, None)
+    if results is None:
+        return
+    results.update(worker_results)
+    path = Path(config.getoption("gas_check_report"))
+    path.write_text(json.dumps(results, indent=2, sort_keys=True))
diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/pytest_ini_files/pytest-fill.ini b/packages/testing/src/execution_testing/cli/pytest_commands/pytest_ini_files/pytest-fill.ini
index 179ac2082f0..a34102e6c37 100644
--- a/packages/testing/src/execution_testing/cli/pytest_commands/pytest_ini_files/pytest-fill.ini
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/pytest_ini_files/pytest-fill.ini
@@ -13,6 +13,7 @@ addopts =
     -p execution_testing.cli.pytest_commands.plugins.filler.pre_alloc
     -p execution_testing.cli.pytest_commands.plugins.filler.ported_tests
     -p execution_testing.cli.pytest_commands.plugins.filler.static_filler
+    -p execution_testing.cli.pytest_commands.plugins.filler.gas_taint
     -p execution_testing.cli.pytest_commands.plugins.shared.benchmarking
     -p execution_testing.cli.pytest_commands.plugins.shared.transaction_fixtures
     -p execution_testing.cli.pytest_commands.plugins.help.help
diff --git a/packages/testing/src/execution_testing/specs/base.py b/packages/testing/src/execution_testing/specs/base.py
index 8aee027f097..59b38f502c0 100644
--- a/packages/testing/src/execution_testing/specs/base.py
+++ b/packages/testing/src/execution_testing/specs/base.py
@@ -20,7 +20,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Self
 
-from execution_testing.base_types import to_hex
+from execution_testing.base_types import HexNumber, to_hex
 from execution_testing.client_clis import Result, TransitionTool
 from execution_testing.client_clis.cli_types import OpcodeCount
 from execution_testing.execution import (
@@ -113,7 +113,7 @@ class BaseTest(BaseModel):
     )
     operation_mode: OpMode | None = None
     gas_optimization_max_gas_limit: int | None = None
-    expected_benchmark_gas_used: int | None = None
+    expected_benchmark_gas_used: HexNumber | None = None
     skip_gas_used_validation: bool = False
     expected_receipt_status: int | None = None
     is_tx_gas_heavy_test: bool = False
@@ -278,7 +278,9 @@ def validate_benchmark_gas(
         if not self.skip_gas_used_validation:
             # Verify that the total gas consumed in the last block
             # matches expectations
-            expected_benchmark_gas_used = self.expected_benchmark_gas_used
+            expected_benchmark_gas_used: HexNumber | int | None = (
+                self.expected_benchmark_gas_used
+            )
             if expected_benchmark_gas_used is None:
                 expected_benchmark_gas_used = gas_benchmark_value
             diff = benchmark_gas_used - expected_benchmark_gas_used
diff --git a/packages/testing/src/execution_testing/specs/benchmark.py b/packages/testing/src/execution_testing/specs/benchmark.py
index 197cb5063eb..097247e6d6c 100644
--- a/packages/testing/src/execution_testing/specs/benchmark.py
+++ b/packages/testing/src/execution_testing/specs/benchmark.py
@@ -300,7 +300,7 @@ class BenchmarkTest(BaseTest):
         | None
     ) = None
     env: Environment = Field(default_factory=Environment)
-    expected_benchmark_gas_used: int | None = None
+    expected_benchmark_gas_used: HexNumber | None = None
     gas_benchmark_value: int = Field(
         default_factory=lambda: int(Environment().gas_limit)
     )
diff --git a/packages/testing/src/execution_testing/specs/blockchain.py b/packages/testing/src/execution_testing/specs/blockchain.py
index 39fc641c3fb..41332741a6c 100644
--- a/packages/testing/src/execution_testing/specs/blockchain.py
+++ b/packages/testing/src/execution_testing/specs/blockchain.py
@@ -306,7 +306,7 @@ class Block(Header):
     """Post state for verification after block execution in BlockchainTest"""
     block_access_list: Bytes | None = Field(None)
     """EIP-7928: Block-level access lists (serialized)."""
-    expected_gas_used: int | None = None
+    expected_gas_used: HexNumber | None = None
     """Expected gas used for the block."""
 
     def set_environment(self, env: Environment) -> Environment:
diff --git a/scripts/mark_tests.py b/scripts/mark_tests.py
new file mode 100644
index 00000000000..bc0fa3fe699
--- /dev/null
+++ b/scripts/mark_tests.py
@@ -0,0 +1,366 @@
+#!/usr/bin/env python3
+"""
+Add a ``@pytest.mark.<name>`` decorator to test functions listed in a
+JSON file. Idempotent — re-running leaves already-marked tests alone.
+
+INPUT FORMATS
+-------------
+
+The input JSON may be in any of these shapes. Parametrize brackets
+(``[…]``) are stripped from node IDs since markers attach to the
+function, not to individual parameter cases.
+
+1. **Bare mapping** (the format emitted by ``--detect-gas-checks``)::
+
+       {
+         "tests/foo/test_bar.py::test_baz[case_id]": [ ... hits ... ],
+         "tests/foo/test_bar.py::TestClass::test_method[other]": [ ... ]
+       }
+
+   Marker name comes from ``--marker`` (default: ``gas_check``).
+
+2. **Bare list of node IDs**::
+
+       [
+         "tests/foo/test_bar.py::test_baz",
+         "tests/foo/test_bar.py::TestClass::test_method"
+       ]
+
+   Marker name comes from ``--marker`` (default: ``gas_check``).
+
+3. **Wrapped (self-describing)** — the JSON declares which marker it
+   maps to, so producers can ship a single file without remembering to
+   pass ``--marker``::
+
+       {
+         "marker": "gas_check",
+         "tests": {
+           "tests/foo/test_bar.py::test_baz[case]": [ ... ]
+         }
+       }
+
+   or::
+
+       {
+         "marker": "slow",
+         "nodeids": [
+           "tests/foo/test_bar.py::test_baz",
+           "tests/foo/test_bar.py::test_qux"
+         ]
+       }
+
+   ``--marker`` on the command line overrides the value in the file.
+
+USAGE
+-----
+
+::
+
+    uv run python scripts/mark_tests.py REPORT.json --marker gas_check
+    uv run python scripts/mark_tests.py --dry-run wrapped_report.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import ast
+import json
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, Iterable, List, Set, Tuple
+
+DEFAULT_MARKER = "gas_check"
+
+
+# ---------------------------------------------------------------------------
+# Input parsing
+# ---------------------------------------------------------------------------
+
+
+def _coerce_nodeids(raw: object) -> Iterable[str]:
+    if isinstance(raw, dict):
+        return raw.keys()
+    if isinstance(raw, list):
+        return [n for n in raw if isinstance(n, str)]
+    raise ValueError(
+        f"expected dict or list of node IDs, got {type(raw).__name__}"
+    )
+
+
+def load_input(
+    payload: object, cli_marker: str | None
+) -> Tuple[str, Dict[str, Set[str]]]:
+    """
+    Resolve the marker name and the ``{module: {qualname, ...}}`` map.
+
+    Format detection:
+      - ``dict`` with a ``"marker"`` key → wrapped form; nodeids come
+        from ``"tests"`` or ``"nodeids"``.
+      - any other ``dict`` → bare mapping (keys are nodeids).
+      - ``list`` → bare list of nodeids.
+
+    Raise ``ValueError`` for a malformed payload or an unusable marker.
+    """
+    marker_in_file: object = None
+    if isinstance(payload, dict) and "marker" in payload:
+        marker_in_file = payload["marker"]
+        body = payload.get("tests")
+        if body is None:
+            body = payload.get("nodeids")
+        if body is None:
+            raise ValueError(
+                "wrapped JSON must have a 'tests' or 'nodeids' field"
+            )
+        nodeids = list(_coerce_nodeids(body))
+    else:
+        nodeids = list(_coerce_nodeids(payload))
+
+    marker = cli_marker or marker_in_file or DEFAULT_MARKER
+    # The JSON ``marker`` may be any type (e.g. a number); reject non-str.
+    if not isinstance(marker, str) or not marker.isidentifier():
+        raise ValueError(
+            f"marker name {marker!r} is not a valid Python identifier"
+        )
+
+    targets: Dict[str, Set[str]] = defaultdict(set)
+    for nodeid in nodeids:
+        module, sep, rest = nodeid.partition("::")
+        if sep:
+            # Drop the parametrize id: markers attach to the function.
+            targets[module].add(rest.partition("[")[0])
+
+    return marker, targets
+
+
+# ---------------------------------------------------------------------------
+# AST navigation
+# ---------------------------------------------------------------------------
+
+
+def _is_marker(node: ast.AST, name: str) -> bool:
+    """Match ``@pytest.mark.NAME`` and ``@pytest.mark.NAME(...)``."""
+    if isinstance(node, ast.Call):
+        node = node.func
+    if not isinstance(node, ast.Attribute) or node.attr != name:
+        return False
+    mark = node.value
+    if not isinstance(mark, ast.Attribute) or mark.attr != "mark":
+        return False
+    pytest_node = mark.value
+    return isinstance(pytest_node, ast.Name) and pytest_node.id == "pytest"
+
+
+def _has_marker(
+    func: ast.FunctionDef | ast.AsyncFunctionDef, name: str
+) -> bool:
+    return any(_is_marker(d, name) for d in func.decorator_list)
+
+
+def _find_function(
+    body: List[ast.stmt], qualname: str
+) -> ast.FunctionDef | ast.AsyncFunctionDef | None:
+    """Resolve ``Class::Method::…`` (or just ``test_foo``) inside ``body``."""
+    parts = qualname.split("::")
+    current_body: List[ast.stmt] = body
+    for i, part in enumerate(parts):
+        last = i == len(parts) - 1
+        for node in current_body:
+            if (
+                last
+                and isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
+                and node.name == part
+            ):
+                return node
+            if (
+                not last
+                and isinstance(node, ast.ClassDef)
+                and node.name == part
+            ):
+                current_body = node.body
+                break
+        else:
+            return None
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Source modification
+# ---------------------------------------------------------------------------
+
+
+def _decorator_insert_line(
+    func: ast.FunctionDef | ast.AsyncFunctionDef,
+) -> int:
+    """
+    1-based line where a new decorator should be inserted.
+
+    Above any existing decorators; above the ``def`` if none.
+    """
+    if func.decorator_list:
+        return func.decorator_list[0].lineno
+    return func.lineno
+
+
+def _line_indent(line: str) -> str:
+    return line[: len(line) - len(line.lstrip())]
+
+
+def insert_markers(
+    source: str, marker: str, targets: Set[str]
+) -> Tuple[str, List[str], List[str]]:
+    """
+    Return ``(new_source, added_qualnames, missing_qualnames)``.
+
+    Targets already carrying the marker are silently skipped.
+    """
+    tree = ast.parse(source)
+    to_add: List[Tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = []
+    missing: List[str] = []
+
+    for qn in sorted(targets):
+        func = _find_function(tree.body, qn)
+        if func is None:
+            missing.append(qn)
+        elif not _has_marker(func, marker):
+            to_add.append((qn, func))
+
+    if not to_add:
+        return source, [], missing
+
+    # Split on "\n" only: str.splitlines() also breaks on form feeds and
+    # Unicode separators that ast does not count as line ends.
+    lines = source.split("\n")
+    # Insert bottom-up so earlier insertions don't shift later line numbers.
+    to_add.sort(key=lambda pair: _decorator_insert_line(pair[1]), reverse=True)
+    added: List[str] = []
+    decorator_line = f"@pytest.mark.{marker}"
+    for qn, func in to_add:
+        idx = _decorator_insert_line(func) - 1  # 1-based -> 0-based
+        if idx < 0 or idx >= len(lines):
+            missing.append(qn)
+            continue
+        indent = _line_indent(lines[idx])
+        lines.insert(idx, indent + decorator_line)
+        added.append(qn)
+
+    return "\n".join(lines), added, missing
+
+
+# ---------------------------------------------------------------------------
+# Driver
+# ---------------------------------------------------------------------------
+
+
+def main(argv: List[str] | None = None) -> int:
+    """
+    Parse command-line arguments and mark the listed test functions.
+
+    Return 0 on success, 1 if any target could not be handled, 2 on bad input.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "report",
+        type=Path,
+        help="Path to JSON file listing node IDs (see module docstring "
+        "for accepted shapes).",
+    )
+    parser.add_argument(
+        "--marker",
+        default=None,
+        help="Pytest marker to apply. Overrides any 'marker' field in "
+        f"the JSON. Defaults to {DEFAULT_MARKER!r} when neither is set.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Report what would change without writing any file.",
+    )
+    parser.add_argument(
+        "--root",
+        type=Path,
+        default=Path.cwd(),
+        help="Repository root; node ID module paths are resolved against "
+        "this directory (default: current directory).",
+    )
+    args = parser.parse_args(argv)
+
+    try:
+        payload = json.loads(args.report.read_text())
+    except (OSError, json.JSONDecodeError) as exc:
+        print(f"error: failed to read {args.report}: {exc}", file=sys.stderr)
+        return 2
+
+    try:
+        marker, targets_by_module = load_input(payload, args.marker)
+    except ValueError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+
+    print(f"Marker: @pytest.mark.{marker}")
+
+    total_added = 0
+    total_already = 0
+    total_missing = 0
+    files_changed = 0
+    files_skipped: List[Path] = []
+
+    for module_rel, qualnames in sorted(targets_by_module.items()):
+        module_path = args.root / module_rel
+        if not module_path.is_file():
+            files_skipped.append(module_path)
+            total_missing += len(qualnames)
+            continue
+        # An unreadable or unparsable file leaves all its targets unmarked.
+        try:
+            original = module_path.read_text(encoding="utf-8")
+            new_source, added, missing = insert_markers(
+                original, marker, qualnames
+            )
+        except (OSError, UnicodeDecodeError) as exc:
+            print(f"  skip {module_rel}: {exc}", file=sys.stderr)
+            total_missing += len(qualnames)
+            continue
+        except SyntaxError as exc:
+            print(
+                f"  skip {module_rel}: syntax error at line "
+                f"{exc.lineno}: {exc.msg}",
+                file=sys.stderr,
+            )
+            total_missing += len(qualnames)
+            continue
+
+        already = len(qualnames) - len(added) - len(missing)
+        total_added += len(added)
+        total_already += already
+        total_missing += len(missing)
+
+        if added:
+            files_changed += 1
+            verb = "would mark" if args.dry_run else "marked"
+            print(f"  {module_rel}: {verb} {len(added)}")
+            for qn in added:
+                print(f"    + {qn}")
+            if not args.dry_run:
+                module_path.write_text(new_source, encoding="utf-8")
+        elif missing:
+            print(f"  {module_rel}: nothing to add")
+        for qn in missing:
+            print(f"    ? {qn}  (not found)")
+
+    total_targets = sum(len(s) for s in targets_by_module.values())
+    print()
+    print(f"Targets in input  : {total_targets}")
+    print(f"Markers added     : {total_added}")
+    print(f"Already present   : {total_already}")
+    print(f"Could not locate  : {total_missing}")
+    print(f"Files changed     : {files_changed}")
+    if files_skipped:
+        print(f"Files skipped     : {len(files_skipped)} (not on disk)")
+    if args.dry_run:
+        print("(dry run — no files modified)")
+    return 0 if total_missing == 0 else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 128c21fa71a18b6949983340a5399512572d8501 Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Sat, 23 May 2026 11:19:26 +0700
Subject: [PATCH 2/9] chore(tests): mark gas-checking tests with
 `@pytest.mark.gas_check`

Applies `@pytest.mark.gas_check` to the 54 test functions identified by
running `fill --detect-gas-checks` on the full suite. Produced by
`scripts/mark_tests.py gas_check_report.json`; re-running the script is
idempotent.

Affects 31 files across 16 EIP directories; one decorator per test
function, inserted at the top of the existing decorator stack.
---
 .../test_gas_accounting.py                                  | 5 +++++
 .../test_max_code_size.py                                   | 1 +
 .../test_additional_coverage.py                             | 6 ++++++
 .../test_execution_gas.py                                   | 2 ++
 .../eip7976_increase_calldata_floor_cost/test_refunds.py    | 1 +
 .../test_access_list_cost.py                                | 1 +
 tests/berlin/eip2929_gas_cost_increases/test_call.py        | 1 +
 tests/berlin/eip2929_gas_cost_increases/test_create.py      | 2 ++
 .../eip2929_gas_cost_increases/test_warm_status_revert.py   | 2 ++
 tests/berlin/eip2930_access_list/test_acl.py                | 2 ++
 tests/byzantium/eip196_ec_add_mul/test_gas.py               | 1 +
 tests/byzantium/eip197_ec_pairing/test_gas.py               | 1 +
 tests/cancun/eip1153_tstore/test_basic_tload.py             | 1 +
 tests/cancun/eip1153_tstore/test_tstorage.py                | 1 +
 tests/cancun/eip4844_blobs/test_blob_txs.py                 | 2 ++
 tests/cancun/eip4844_blobs/test_blobhash_opcode.py          | 1 +
 tests/cancun/eip4844_blobs/test_excess_blob_gas.py          | 3 +++
 .../eip4844_blobs/test_point_evaluation_precompile.py       | 1 +
 .../eip4844_blobs/test_point_evaluation_precompile_gas.py   | 1 +
 tests/cancun/eip5656_mcopy/test_mcopy_memory_expansion.py   | 2 ++
 tests/frontier/opcodes/test_all_opcodes.py                  | 1 +
 tests/frontier/opcodes/test_call.py                         | 3 +++
 tests/frontier/opcodes/test_exp.py                          | 1 +
 tests/frontier/opcodes/test_log.py                          | 1 +
 .../osaka/eip7918_blob_reserve_price/test_blob_base_fee.py  | 2 ++
 .../eip7939_count_leading_zeros/test_count_leading_zeros.py | 1 +
 .../eip7623_increase_calldata_cost/test_execution_gas.py    | 2 ++
 tests/prague/eip7623_increase_calldata_cost/test_refunds.py | 1 +
 tests/prague/eip7702_set_code_tx/test_set_code_txs.py       | 1 +
 tests/prague/eip7702_set_code_tx/test_set_code_txs_2.py     | 3 +++
 tests/shanghai/eip3651_warm_coinbase/test_warm_coinbase.py  | 1 +
 31 files changed, 54 insertions(+)

diff --git a/tests/amsterdam/eip7778_block_gas_accounting_without_refunds/test_gas_accounting.py b/tests/amsterdam/eip7778_block_gas_accounting_without_refunds/test_gas_accounting.py
index d1b9f9121af..e3dfbc937e8 100644
--- a/tests/amsterdam/eip7778_block_gas_accounting_without_refunds/test_gas_accounting.py
+++ b/tests/amsterdam/eip7778_block_gas_accounting_without_refunds/test_gas_accounting.py
@@ -168,6 +168,7 @@ def build_refund_tx(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "refund_tx_reverts",
     [
@@ -215,6 +216,7 @@ def test_simple_gas_accounting(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "refund_tx_reverts",
     [
@@ -356,6 +358,7 @@ class CallDataTestType(Enum):
     """calldata_floor > tx_gas_before_refund."""
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "refund_tx_reverts",
     [
@@ -487,6 +490,7 @@ def test_varying_calldata_costs(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "refund_tx_reverts",
     [
@@ -533,6 +537,7 @@ def test_multiple_refund_types_in_one_tx(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.execute(pytest.mark.skip(reason="Requires specific gas price"))
 @pytest.mark.valid_from("EIP7778")
 def test_mixed_gas_regimes(
diff --git a/tests/amsterdam/eip7954_increase_max_contract_size/test_max_code_size.py b/tests/amsterdam/eip7954_increase_max_contract_size/test_max_code_size.py
index 85ed98451c9..47a26d35cde 100644
--- a/tests/amsterdam/eip7954_increase_max_contract_size/test_max_code_size.py
+++ b/tests/amsterdam/eip7954_increase_max_contract_size/test_max_code_size.py
@@ -273,6 +273,7 @@ def test_max_code_size_self_opcodes(
     state_test(pre=pre, tx=tx, post=post)
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "create_opcode",
     [
diff --git a/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_additional_coverage.py b/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_additional_coverage.py
index c62550daa9d..b1645a14f3a 100644
--- a/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_additional_coverage.py
+++ b/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_additional_coverage.py
@@ -49,6 +49,7 @@ def to(self, pre: Alloc) -> Address:
         """Deploy a simple contract that does nothing."""
         return pre.deploy_contract(Op.STOP)
 
+    @pytest.mark.gas_check
     @pytest.mark.parametrize(
         "calldata,expected_standard_tokens,description",
         [
@@ -185,6 +186,7 @@ def to(self, pre: Alloc) -> Address:
         """Deploy a simple contract."""
         return pre.deploy_contract(Op.STOP)
 
+    @pytest.mark.gas_check
     def test_maximum_calldata_size(
         self,
         state_test: StateTestFiller,
@@ -535,6 +537,7 @@ def to(self, pre: Alloc) -> Address:
         """Deploy a simple contract."""
         return pre.deploy_contract(Op.STOP)
 
+    @pytest.mark.gas_check
     @pytest.mark.parametrize(
         "access_list,authorization_list",
         [
@@ -667,6 +670,7 @@ def to(self, pre: Alloc) -> Address:
         """Deploy a simple contract."""
         return pre.deploy_contract(Op.STOP)
 
+    @pytest.mark.gas_check
     @pytest.mark.parametrize(
         "num_authorizations",
         [
@@ -767,6 +771,7 @@ def sender(self, pre: Alloc) -> Address:
         """Create sender account."""
         return pre.fund_eoa()
 
+    @pytest.mark.gas_check
     def test_refund_cap_at_one_fifth(
         self,
         state_test: StateTestFiller,
@@ -854,6 +859,7 @@ def test_refund_cap_at_one_fifth(
             tx=tx,
         )
 
+    @pytest.mark.gas_check
     def test_floor_cost_not_reduced_by_refunds(
         self,
         state_test: StateTestFiller,
diff --git a/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_execution_gas.py b/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_execution_gas.py
index a46c9d5daf6..160e5f4ad30 100644
--- a/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_execution_gas.py
+++ b/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_execution_gas.py
@@ -55,6 +55,7 @@ def to(
         """
         return pre.deploy_contract(Op.INVALID)
 
+    @pytest.mark.gas_check
     @pytest.mark.parametrize(
         "ty,protected,authorization_list",
         [
@@ -133,6 +134,7 @@ def to(
             (Op.JUMPDEST * (execution_gas - 1)) + Op.STOP
         )
 
+    @pytest.mark.gas_check
     @pytest.mark.parametrize(
         "ty,protected,authorization_list",
         [
diff --git a/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_refunds.py b/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_refunds.py
index 4fcddcd316c..0cdbd338caa 100644
--- a/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_refunds.py
+++ b/tests/amsterdam/eip7976_increase_calldata_floor_cost/test_refunds.py
@@ -271,6 +271,7 @@ def tx_gas_limit(
     return tx_gas_limit
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "refund_test_type",
     [
diff --git a/tests/amsterdam/eip7981_increase_access_list_cost/test_access_list_cost.py b/tests/amsterdam/eip7981_increase_access_list_cost/test_access_list_cost.py
index e9479b276c9..a102b6bdd99 100644
--- a/tests/amsterdam/eip7981_increase_access_list_cost/test_access_list_cost.py
+++ b/tests/amsterdam/eip7981_increase_access_list_cost/test_access_list_cost.py
@@ -134,6 +134,7 @@ def test_access_list_token_calculation(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.with_all_tx_types(selector=lambda tx_type: tx_type >= 1)
 @pytest.mark.parametrize(
     "access_list,tx_data",
diff --git a/tests/berlin/eip2929_gas_cost_increases/test_call.py b/tests/berlin/eip2929_gas_cost_increases/test_call.py
index 7322e1b1d2f..8776b386b5f 100644
--- a/tests/berlin/eip2929_gas_cost_increases/test_call.py
+++ b/tests/berlin/eip2929_gas_cost_increases/test_call.py
@@ -16,6 +16,7 @@
 REFERENCE_SPEC_VERSION = "0e11417265a623adb680c527b15d0cb6701b870b"
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 @pytest.mark.eels_base_coverage
 def test_call_insufficient_balance(
diff --git a/tests/berlin/eip2929_gas_cost_increases/test_create.py b/tests/berlin/eip2929_gas_cost_increases/test_create.py
index f216a39242e..8a2c333b436 100644
--- a/tests/berlin/eip2929_gas_cost_increases/test_create.py
+++ b/tests/berlin/eip2929_gas_cost_increases/test_create.py
@@ -41,6 +41,7 @@
 REFERENCE_SPEC_VERSION = "0e11417265a623adb680c527b15d0cb6701b870b"
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 @pytest.mark.parametrize(
     "create_opcode",
@@ -176,6 +177,7 @@ def test_create_insufficient_balance(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 @pytest.mark.parametrize(
     "create_opcode",
diff --git a/tests/berlin/eip2929_gas_cost_increases/test_warm_status_revert.py b/tests/berlin/eip2929_gas_cost_increases/test_warm_status_revert.py
index 3925cd89958..980f032a2f5 100644
--- a/tests/berlin/eip2929_gas_cost_increases/test_warm_status_revert.py
+++ b/tests/berlin/eip2929_gas_cost_increases/test_warm_status_revert.py
@@ -19,6 +19,7 @@
 REFERENCE_SPEC_VERSION = "0e11417265a623adb680c527b15d0cb6701b870b"
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 def test_storage_warm_status_reverted_by_subcall(
     state_test: StateTestFiller,
@@ -89,6 +90,7 @@ def test_storage_warm_status_reverted_by_subcall(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 def test_account_warm_status_reverted_by_subcall(
     state_test: StateTestFiller,
diff --git a/tests/berlin/eip2930_access_list/test_acl.py b/tests/berlin/eip2930_access_list/test_acl.py
index 7f0f3a82498..347d22a6ed3 100644
--- a/tests/berlin/eip2930_access_list/test_acl.py
+++ b/tests/berlin/eip2930_access_list/test_acl.py
@@ -24,6 +24,7 @@
 pytestmark = pytest.mark.valid_from("Berlin")
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "account_warm,storage_key_warm",
     [
@@ -278,6 +279,7 @@ def test_transaction_intrinsic_gas_cost(
     state_test(env=env, pre=pre, post=post, tx=tx)
 
 
+@pytest.mark.gas_check
 def test_repeated_address_acl(
     state_test: StateTestFiller,
     pre: Alloc,
diff --git a/tests/byzantium/eip196_ec_add_mul/test_gas.py b/tests/byzantium/eip196_ec_add_mul/test_gas.py
index bcf5012bc25..405d920feef 100644
--- a/tests/byzantium/eip196_ec_add_mul/test_gas.py
+++ b/tests/byzantium/eip196_ec_add_mul/test_gas.py
@@ -65,6 +65,7 @@ def test_gas_costs(
     state_test(pre=pre, post=post, tx=tx)
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Byzantium")
 @pytest.mark.parametrize(
     "precompile_address, invalid_input",
diff --git a/tests/byzantium/eip197_ec_pairing/test_gas.py b/tests/byzantium/eip197_ec_pairing/test_gas.py
index 1a1108c37f0..edabde581c0 100644
--- a/tests/byzantium/eip197_ec_pairing/test_gas.py
+++ b/tests/byzantium/eip197_ec_pairing/test_gas.py
@@ -62,6 +62,7 @@ def test_gas_costs(
     state_test(pre=pre, post=post, tx=tx)
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Byzantium")
 @pytest.mark.parametrize(
     "input_data",
diff --git a/tests/cancun/eip1153_tstore/test_basic_tload.py b/tests/cancun/eip1153_tstore/test_basic_tload.py
index 6cd9eddd647..72c8bbd0cc9 100644
--- a/tests/cancun/eip1153_tstore/test_basic_tload.py
+++ b/tests/cancun/eip1153_tstore/test_basic_tload.py
@@ -186,6 +186,7 @@ def test_basic_tload_other_after_tstore(
     state_test(env=Environment(), pre=pre, post=post, tx=tx)
 
 
+@pytest.mark.gas_check
 @pytest.mark.ported_from(
     [
         "https://github.com/ethereum/tests/blob/v13.3/src/GeneralStateTestsFiller/Cancun/stEIP1153-transientStorage/16_tloadGasFiller.yml",
diff --git a/tests/cancun/eip1153_tstore/test_tstorage.py b/tests/cancun/eip1153_tstore/test_tstorage.py
index a32bb0504df..27f68ece331 100644
--- a/tests/cancun/eip1153_tstore/test_tstorage.py
+++ b/tests/cancun/eip1153_tstore/test_tstorage.py
@@ -239,6 +239,7 @@ class GasMeasureTestCases(PytestParameterEnum):
     }
 
 
+@pytest.mark.gas_check
 @pytest.mark.ported_from(
     [
         "https://github.com/ethereum/tests/blob/v13.3/src/GeneralStateTestsFiller/Cancun/stEIP1153-transientStorage/17_tstoreGasFiller.yml",  # noqa: E501
diff --git a/tests/cancun/eip4844_blobs/test_blob_txs.py b/tests/cancun/eip4844_blobs/test_blob_txs.py
index 50be25f9524..2d0071ba453 100644
--- a/tests/cancun/eip4844_blobs/test_blob_txs.py
+++ b/tests/cancun/eip4844_blobs/test_blob_txs.py
@@ -390,6 +390,7 @@ def block(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize_by_fork(
     "blobs_per_tx",
     SpecHelpers.all_valid_blob_combinations,
@@ -729,6 +730,7 @@ def test_sufficient_balance_blob_tx(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize_by_fork(
     "blobs_per_tx",
     lambda fork: [
diff --git a/tests/cancun/eip4844_blobs/test_blobhash_opcode.py b/tests/cancun/eip4844_blobs/test_blobhash_opcode.py
index fdf1d87e1b7..9b551bab386 100644
--- a/tests/cancun/eip4844_blobs/test_blobhash_opcode.py
+++ b/tests/cancun/eip4844_blobs/test_blobhash_opcode.py
@@ -172,6 +172,7 @@ def generate_blobhash_bytecode(
         return scenario
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize("blobhash_index", blobhash_index_values)
 @pytest.mark.with_all_tx_types
 def test_blobhash_gas_cost(
diff --git a/tests/cancun/eip4844_blobs/test_excess_blob_gas.py b/tests/cancun/eip4844_blobs/test_excess_blob_gas.py
index 5d6c14a10e8..1f9e1e2acd7 100644
--- a/tests/cancun/eip4844_blobs/test_excess_blob_gas.py
+++ b/tests/cancun/eip4844_blobs/test_excess_blob_gas.py
@@ -301,6 +301,7 @@ def post(  # noqa: D103
     }
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize_by_fork(
     "parent_blobs",
     lambda fork: range(0, fork.max_blobs_per_block() + 1),
@@ -365,6 +366,7 @@ def generator_function(fork: Fork) -> List[int]:
     return generator_function
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize_by_fork(
     "parent_excess_blobs",
     generate_blob_gas_cost_increases_tests(-1),
@@ -401,6 +403,7 @@ def test_correct_increasing_blob_gas_costs(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize_by_fork(
     "parent_excess_blobs",
     generate_blob_gas_cost_increases_tests(0),
diff --git a/tests/cancun/eip4844_blobs/test_point_evaluation_precompile.py b/tests/cancun/eip4844_blobs/test_point_evaluation_precompile.py
index 952b41069ea..547e814e4e6 100644
--- a/tests/cancun/eip4844_blobs/test_point_evaluation_precompile.py
+++ b/tests/cancun/eip4844_blobs/test_point_evaluation_precompile.py
@@ -534,6 +534,7 @@ def test_call_opcode_types(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "call_gas",
     [
diff --git a/tests/cancun/eip4844_blobs/test_point_evaluation_precompile_gas.py b/tests/cancun/eip4844_blobs/test_point_evaluation_precompile_gas.py
index 231f084a7f9..733bad52fb8 100644
--- a/tests/cancun/eip4844_blobs/test_point_evaluation_precompile_gas.py
+++ b/tests/cancun/eip4844_blobs/test_point_evaluation_precompile_gas.py
@@ -188,6 +188,7 @@ def post(
     }
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "call_type",
     [Op.CALL, Op.DELEGATECALL, Op.CALLCODE, Op.STATICCALL],
diff --git a/tests/cancun/eip5656_mcopy/test_mcopy_memory_expansion.py b/tests/cancun/eip5656_mcopy/test_mcopy_memory_expansion.py
index 23f32896ed1..a4cff355b04 100644
--- a/tests/cancun/eip5656_mcopy/test_mcopy_memory_expansion.py
+++ b/tests/cancun/eip5656_mcopy/test_mcopy_memory_expansion.py
@@ -151,6 +151,7 @@ def post(  # noqa: D103
     }
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "dest,src,length",
     [
@@ -210,6 +211,7 @@ def test_mcopy_memory_expansion(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "dest,src,length",
     [
diff --git a/tests/frontier/opcodes/test_all_opcodes.py b/tests/frontier/opcodes/test_all_opcodes.py
index c13bb4f1563..6bdec21fb47 100644
--- a/tests/frontier/opcodes/test_all_opcodes.py
+++ b/tests/frontier/opcodes/test_all_opcodes.py
@@ -305,6 +305,7 @@ def constant_gas_opcodes(fork: Fork) -> Generator[ParameterSet, None, None]:
         )
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 @pytest.mark.parametrize_by_fork("opcode", constant_gas_opcodes)
 @pytest.mark.eels_base_coverage
diff --git a/tests/frontier/opcodes/test_call.py b/tests/frontier/opcodes/test_call.py
index 95ad749a868..b0cdd94e956 100644
--- a/tests/frontier/opcodes/test_call.py
+++ b/tests/frontier/opcodes/test_call.py
@@ -16,6 +16,7 @@
 # TODO: There's an issue with gas definitions on forks previous to Berlin,
 # remove this when fixed. https://github.com/ethereum/execution-spec-
 # tests/pull/1952#discussion_r2237634275
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 def test_call_large_offset_mstore(
     state_test: StateTestFiller,
@@ -83,6 +84,7 @@ def test_call_large_offset_mstore(
 # TODO: There's an issue with gas definitions on forks previous to Berlin,
 # remove this when fixed. https://github.com/ethereum/execution-spec-
 # tests/pull/1952#discussion_r2237634275
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 def test_call_memory_expands_on_early_revert(
     state_test: StateTestFiller,
@@ -170,6 +172,7 @@ def test_call_memory_expands_on_early_revert(
 # TODO: There's an issue with gas definitions on forks previous to Berlin,
 # remove this when fixed. https://github.com/ethereum/execution-spec-
 # tests/pull/1952#discussion_r2237634275
+@pytest.mark.gas_check
 @pytest.mark.with_all_call_opcodes
 @pytest.mark.valid_from("Berlin")
 def test_call_large_args_offset_size_zero(
diff --git a/tests/frontier/opcodes/test_exp.py b/tests/frontier/opcodes/test_exp.py
index 2fcbbe9f990..1cab6ba6338 100644
--- a/tests/frontier/opcodes/test_exp.py
+++ b/tests/frontier/opcodes/test_exp.py
@@ -15,6 +15,7 @@
 REFERENCE_SPEC_VERSION = "N/A"
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 @pytest.mark.parametrize(
     "a", [0, 1, pytest.param(2**256 - 1, id="a2to256minus1")]
diff --git a/tests/frontier/opcodes/test_log.py b/tests/frontier/opcodes/test_log.py
index e3cfe61c4fc..e25130673d0 100644
--- a/tests/frontier/opcodes/test_log.py
+++ b/tests/frontier/opcodes/test_log.py
@@ -15,6 +15,7 @@
 REFERENCE_SPEC_VERSION = "N/A"
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")
 @pytest.mark.parametrize(
     "opcode,topics",
diff --git a/tests/osaka/eip7918_blob_reserve_price/test_blob_base_fee.py b/tests/osaka/eip7918_blob_reserve_price/test_blob_base_fee.py
index cd6b444f227..3b778c96fd5 100644
--- a/tests/osaka/eip7918_blob_reserve_price/test_blob_base_fee.py
+++ b/tests/osaka/eip7918_blob_reserve_price/test_blob_base_fee.py
@@ -133,6 +133,7 @@ def post(
     }
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "block_base_fee_per_gas",
     [1, 7, 15, 16, 17, 100, 1000, 10000],
@@ -228,6 +229,7 @@ def get_boundary_scenarios(fork: Fork) -> Iterator[Any]:
         yield pytest.param(excess_blobs, delta)
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize_by_fork(
     "parent_excess_blobs,block_base_fee_per_gas_delta",
     get_boundary_scenarios,
diff --git a/tests/osaka/eip7939_count_leading_zeros/test_count_leading_zeros.py b/tests/osaka/eip7939_count_leading_zeros/test_count_leading_zeros.py
index 171b8255431..daf0762cc12 100644
--- a/tests/osaka/eip7939_count_leading_zeros/test_count_leading_zeros.py
+++ b/tests/osaka/eip7939_count_leading_zeros/test_count_leading_zeros.py
@@ -125,6 +125,7 @@ def test_clz_opcode_scenarios(
     state_test(pre=pre, post=post, tx=tx)
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Osaka")
 def test_clz_gas_cost(
     state_test: StateTestFiller, pre: Alloc, fork: Fork
diff --git a/tests/prague/eip7623_increase_calldata_cost/test_execution_gas.py b/tests/prague/eip7623_increase_calldata_cost/test_execution_gas.py
index 7e8bc2c80f6..0ba75d781a3 100644
--- a/tests/prague/eip7623_increase_calldata_cost/test_execution_gas.py
+++ b/tests/prague/eip7623_increase_calldata_cost/test_execution_gas.py
@@ -57,6 +57,7 @@ def to(
         """
         return pre.deploy_contract(Op.INVALID)
 
+    @pytest.mark.gas_check
     @pytest.mark.parametrize(
         "ty,protected,authorization_list",
         [
@@ -135,6 +136,7 @@ def to(
             (Op.JUMPDEST * (execution_gas - 1)) + Op.STOP
         )
 
+    @pytest.mark.gas_check
     @pytest.mark.parametrize(
         "ty,protected,authorization_list",
         [
diff --git a/tests/prague/eip7623_increase_calldata_cost/test_refunds.py b/tests/prague/eip7623_increase_calldata_cost/test_refunds.py
index fdbe63e0c03..57eb183444c 100644
--- a/tests/prague/eip7623_increase_calldata_cost/test_refunds.py
+++ b/tests/prague/eip7623_increase_calldata_cost/test_refunds.py
@@ -279,6 +279,7 @@ def tx_gas_limit(
     return tx_gas_limit
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "refund_test_type",
     [
diff --git a/tests/prague/eip7702_set_code_tx/test_set_code_txs.py b/tests/prague/eip7702_set_code_tx/test_set_code_txs.py
index 0a08b2cbfb9..f38488e3374 100644
--- a/tests/prague/eip7702_set_code_tx/test_set_code_txs.py
+++ b/tests/prague/eip7702_set_code_tx/test_set_code_txs.py
@@ -3113,6 +3113,7 @@ def test_set_code_to_precompile(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.with_all_precompiles
 def test_set_code_to_precompile_not_enough_gas_for_precompile_execution(
     state_test: StateTestFiller,
diff --git a/tests/prague/eip7702_set_code_tx/test_set_code_txs_2.py b/tests/prague/eip7702_set_code_tx/test_set_code_txs_2.py
index c92a3c34ff6..b6a47ebb83a 100644
--- a/tests/prague/eip7702_set_code_tx/test_set_code_txs_2.py
+++ b/tests/prague/eip7702_set_code_tx/test_set_code_txs_2.py
@@ -656,6 +656,7 @@ class AccessListTo(Enum):
     CONTRACT_ADDRESS = 2
 
 
+@pytest.mark.gas_check
 @pytest.mark.parametrize(
     "access_list_rule",
     [
@@ -876,6 +877,7 @@ def test_gas_diff_pointer_vs_direct_call(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Prague")
 def test_pointer_call_followed_by_direct_call(
     state_test: StateTestFiller,
@@ -2096,6 +2098,7 @@ def test_set_code_type_tx_pre_fork(
     )
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Prague")
 @pytest.mark.xdist_group(name="bigmem")
 def test_delegation_replacement_call_previous_contract(
diff --git a/tests/shanghai/eip3651_warm_coinbase/test_warm_coinbase.py b/tests/shanghai/eip3651_warm_coinbase/test_warm_coinbase.py
index 1a33c174470..9188777851f 100644
--- a/tests/shanghai/eip3651_warm_coinbase/test_warm_coinbase.py
+++ b/tests/shanghai/eip3651_warm_coinbase/test_warm_coinbase.py
@@ -136,6 +136,7 @@ def test_warm_coinbase_call_out_of_gas(
 ]
 
 
+@pytest.mark.gas_check
 @pytest.mark.valid_from("Berlin")  # these tests fill for fork >= Berlin
 @pytest.mark.parametrize(
     "opcode,measured_code,extra_stack_items",

From 47f70598e7c1ea55642e245d135e1c3f5269a304 Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Sat, 23 May 2026 11:50:22 +0700
Subject: [PATCH 3/9] test(tooling): unit tests for the gas_taint plugin

38 tests covering:

- ``GasTainted`` carrier: construction, arithmetic propagation (incl.
  reflected ops), origin union/dedup, and the NotImplemented forward
  path that fixes ``bytes_literal * GasTainted(n)``.
- ``install_taint`` / ``uninstall_taint``: taint reaches Bytecode and
  fork gas-cost outputs, survives ``HashInt`` / ``HexNumber`` /
  ``ZeroPaddedHexNumber`` and Pydantic ``Storage`` validation, doesn't
  leak onto plain ints, and uninstall reverts.
- ``collect_taint_hits``: storage taint detection, presence-based
  receipt / header / block-level / benchmark detection, the
  BenchmarkTest MRO gate that prevents filler-default false positives,
  and the OOG exclusions (tx.error, block_exception, exception_test).
---
 .../plugins/filler/tests/test_gas_taint.py    | 490 ++++++++++++++++++
 1 file changed, 490 insertions(+)
 create mode 100644 packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py

diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py
new file mode 100644
index 00000000000..8e7ff9e03cc
--- /dev/null
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py
@@ -0,0 +1,490 @@
+"""Unit tests for the gas_taint plugin."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import Any, Callable, Iterator, cast
+
+import pytest
+
+from execution_testing.base_types.base_types import (
+    HashInt,
+    HexNumber,
+    ZeroPaddedHexNumber,
+)
+from execution_testing.base_types.composite_types import Storage
+from execution_testing.cli.pytest_commands.plugins.filler.gas_taint import (
+    GasTainted,
+    _is_tainted,
+    _origins_of,
+    collect_taint_hits,
+    install_taint,
+    uninstall_taint,
+)
+
+# ---------------------------------------------------------------------------
+# GasTainted carrier
+# ---------------------------------------------------------------------------
+
+
+class TestGasTaintedCarrier:
+    """The int subclass that carries provenance through arithmetic."""
+
+    def test_construction_attaches_origins(self) -> None:
+        """Test construction attaches origins."""
+        t = GasTainted(42, ("source.X",))
+        assert int(t) == 42
+        assert t.origins == ("source.X",)
+        assert isinstance(t, int)
+
+    def test_default_origin_when_empty(self) -> None:
+        """Test default origin when empty."""
+        # An empty tuple gets replaced with the "?" placeholder so callers
+        # can always rely on ``origins`` being non-empty.
+        t = GasTainted(7)
+        assert t.origins == ("?",)
+
+    def test_is_tainted_and_origins_of(self) -> None:
+        """Test is tainted and origins of."""
+        assert _is_tainted(GasTainted(1, ("X",)))
+        assert not _is_tainted(1)
+        assert _origins_of(GasTainted(5, ("Y",))) == ("Y",)
+        assert _origins_of(5) is None
+
+    @pytest.mark.parametrize(
+        "op,expected",
+        [
+            (lambda a, b: a + b, 13),
+            (lambda a, b: a - b, 7),
+            (lambda a, b: a * b, 30),
+            (lambda a, b: a // b, 3),
+            (lambda a, b: a % b, 1),
+        ],
+    )
+    def test_arithmetic_preserves_taint(
+        self, op: Callable[[int, int], int], expected: int
+    ) -> None:
+        """Test arithmetic preserves taint."""
+        a = GasTainted(10, ("A",))
+        result = op(a, 3)
+        assert isinstance(result, GasTainted)
+        assert int(result) == expected
+        assert result.origins == ("A",)
+
+    def test_negation_preserves_taint(self) -> None:
+        """Test negation preserves taint."""
+        n = -GasTainted(5, ("X",))
+        assert isinstance(n, GasTainted)
+        assert int(n) == -5
+        assert n.origins == ("X",)
+
+    def test_reflected_ops_preserve_taint(self) -> None:
+        """Test reflected ops preserve taint."""
+        # ``5 + GasTainted(3)`` triggers __radd__ on the tainted side.
+        result = 5 + GasTainted(3, ("R",))
+        assert isinstance(result, GasTainted)
+        assert int(result) == 8
+        assert result.origins == ("R",)
+
+    def test_merge_origins_unions(self) -> None:
+        """Test merge origins unions."""
+        a = GasTainted(2, ("A",))
+        b = GasTainted(3, ("B",))
+        s = a + b
+        # Order preserved (a's origins first, then b's).
+        assert s.origins == ("A", "B")
+
+    def test_merge_origins_deduplicates(self) -> None:
+        """Test merge origins deduplicates."""
+        a = GasTainted(2, ("X", "Y"))
+        b = GasTainted(3, ("Y", "Z"))
+        s = a + b
+        assert s.origins == ("X", "Y", "Z")
+
+    def test_rmul_with_bytes_returns_notimplemented_gracefully(self) -> None:
+        """Test rmul with bytes returns notimplemented gracefully."""
+        # ``b"\x01" * GasTainted(n)`` is a real expression in tests that
+        # compute calldata sizes from gas. ``bytes`` implements ``*`` only
+        # as sequence repetition, so CPython first tries the numeric
+        # ``GasTainted.__rmul__(n, b"...")``; ``int.__rmul__`` returns
+        # ``NotImplemented``, which the dunder must forward — wrapping it
+        # in a new GasTainted blows up on ``int(NotImplemented)``.
+        result = b"\x01" * GasTainted(5, ("X",))
+        assert result == b"\x01\x01\x01\x01\x01"
+        assert type(result) is bytes
+
+
+# ---------------------------------------------------------------------------
+# install_taint / uninstall_taint
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def taint_installed() -> Iterator[None]:
+    """Install taint patches for one test, then revert."""
+    install_taint()
+    try:
+        yield
+    finally:
+        uninstall_taint()
+
+
+@pytest.mark.usefixtures("taint_installed")
+class TestTaintInstallation:
+    """The monkey-patches that make gas values carry provenance."""
+
+    def test_bytecode_gas_cost_is_tainted(self) -> None:
+        """Test bytecode gas cost is tainted."""
+        from execution_testing import Op
+        from execution_testing.forks.forks.forks import Cancun
+
+        result = Op.PUSH1[1].gas_cost(Cancun)
+        assert _is_tainted(result)
+        # Origin can be either the per-opcode label (when the wrapped
+        # opcode_gas_calculator already returns tainted gas_costs.X) or
+        # the outer Bytecode.gas_cost label (when summing untainted
+        # constants). The important guarantee is that *some* gas origin
+        # is recorded.
+        assert any("gas" in o for o in cast(GasTainted, result).origins)
+
+    def test_fork_gas_costs_fields_are_tainted(self) -> None:
+        """Test fork gas costs fields are tainted."""
+        from execution_testing.forks.forks.forks import Cancun
+
+        gc = Cancun.gas_costs()
+        assert _is_tainted(gc.STORAGE_SET)
+        assert (
+            "gas_costs.STORAGE_SET" in cast(GasTainted, gc.STORAGE_SET).origins
+        )
+
+    def test_hashint_preserves_taint(self) -> None:
+        """Test hashint preserves taint."""
+        # HashInt inherits from FixedSizeHexNumber, *not* from Number.
+        # This is the case that motivated patching both __new__ methods.
+        tainted = GasTainted(123, ("src",))
+        h = HashInt(tainted)
+        assert _is_tainted(h)
+        assert cast(GasTainted, h).origins == ("src",)
+
+    def test_hexnumber_preserves_taint(self) -> None:
+        """Test hexnumber preserves taint."""
+        # HexNumber inherits from Number — covered by the Number.__new__
+        # patch.
+        tainted = GasTainted(456, ("src",))
+        x = HexNumber(tainted)
+        assert _is_tainted(x)
+
+    def test_zero_padded_hexnumber_preserves_taint(self) -> None:
+        """Test zero padded hexnumber preserves taint."""
+        tainted = GasTainted(789, ("src",))
+        x = ZeroPaddedHexNumber(tainted)
+        assert _is_tainted(x)
+
+    def test_plain_value_through_constructors_is_not_tainted(self) -> None:
+        """Test plain value through constructors is not tainted."""
+        # The patch must be opt-in: an ordinary int passed through these
+        # constructors must NOT become tainted, or every storage slot
+        # would be flagged.
+        assert not _is_tainted(HashInt(42))
+        assert not _is_tainted(HexNumber(42))
+
+    def test_storage_dict_preserves_taint(self) -> None:
+        """Test storage dict preserves taint."""
+        # Storage is a Pydantic RootModel; the value goes through HashInt
+        # coercion. This is the critical end-to-end taint path.
+        tainted = GasTainted(99, ("from_test",))
+        s = Storage(cast(Any, {0: tainted}))
+        stored = next(iter(s.root.values()))
+        assert _is_tainted(stored)
+        assert cast(GasTainted, stored).origins == ("from_test",)
+
+    def test_install_is_idempotent(self) -> None:
+        """Test install is idempotent."""
+        install_taint()
+        try:
+            install_taint()  # second call must not double-wrap.
+            from execution_testing import Op
+            from execution_testing.forks.forks.forks import Cancun
+
+            result = Op.PUSH1[1].gas_cost(Cancun)
+            # If install double-wrapped, origins would contain
+            # 'Bytecode.gas_cost' twice. The carrier dedupes within a
+            # tuple, but the double-wrap would still produce nested
+            # GasTainted instances and confused arithmetic.
+            assert _is_tainted(result)
+            # Sanity check: it's a flat int subclass, not a nested
+            # GasTainted-of-GasTainted.
+            assert type(result).__name__ == "GasTainted"
+            assert type(int(result)) is int
+        finally:
+            uninstall_taint()
+
+    def test_uninstall_reverts_patches(self) -> None:
+        """Test uninstall reverts patches."""
+        install_taint()
+        uninstall_taint()
+        from execution_testing import Op
+        from execution_testing.forks.forks.forks import Cancun
+
+        result = Op.PUSH1[1].gas_cost(Cancun)
+        assert not _is_tainted(result)
+        gc = Cancun.gas_costs()
+        assert not _is_tainted(gc.STORAGE_SET)
+
+
+# ---------------------------------------------------------------------------
+# collect_taint_hits walker
+# ---------------------------------------------------------------------------
+
+
+def _make_account(storage_dict: dict | None) -> SimpleNamespace:
+    return SimpleNamespace(
+        storage=SimpleNamespace(root=storage_dict) if storage_dict else None
+    )
+
+
+def _make_post(accounts: dict) -> dict:
+    """Return a dict that quacks like ``Alloc`` (provides ``.items()``)."""
+    return accounts
+
+
+def _make_test(
+    *,
+    post: Any = None,
+    tx: Any = None,
+    blocks: Any = None,
+    block_exception: Any = None,
+    expected_benchmark_gas_used: Any = None,
+) -> SimpleNamespace:
+    """
+    Build a minimal test object with the attributes the walker reads.
+
+    The benchmark sink gates on the class MRO containing a class named
+    ``BenchmarkTest``; SimpleNamespace doesn't satisfy that, so the
+    benchmark tests build a separate real subclass below.
+    """
+    return SimpleNamespace(
+        post=post,
+        tx=tx,
+        blocks=blocks,
+        block_exception=block_exception,
+        expected_benchmark_gas_used=expected_benchmark_gas_used,
+    )
+
+
+def _make_node(has_exception_marker: bool = False) -> Any:
+    # Returned as Any so test sites can pass the duck-typed namespace
+    # to ``collect_taint_hits`` (which is typed ``pytest.Item | None``).
+    return SimpleNamespace(
+        get_closest_marker=lambda name: (
+            object()
+            if name == "exception_test" and has_exception_marker
+            else None
+        )
+    )
+
+
+class TestStorageSink:
+    """post[addr].storage[slot] requires taint to flag."""
+
+    def test_tainted_storage_value_is_recorded(self) -> None:
+        """Test tainted storage value is recorded."""
+        post = _make_post(
+            {"0xabc": _make_account({0: GasTainted(42, ("Y",))})}
+        )
+        hits = collect_taint_hits(_make_test(post=post), _make_node())
+        assert len(hits) == 1
+        assert hits[0]["kind"] == "storage"
+        assert hits[0]["value"] == 42
+        assert hits[0]["origins"] == ["Y"]
+        assert "0xabc" in hits[0]["location"]
+
+    def test_untainted_storage_value_is_skipped(self) -> None:
+        """Test untainted storage value is skipped."""
+        # A plain int in storage — the whole point of using taint here is
+        # that storage slots can hold anything.
+        post = _make_post({"0xabc": _make_account({0: 42})})
+        hits = collect_taint_hits(_make_test(post=post), _make_node())
+        assert hits == []
+
+    def test_account_with_no_storage_is_skipped(self) -> None:
+        """Test account with no storage is skipped."""
+        post = _make_post({"0xabc": _make_account(None)})
+        hits = collect_taint_hits(_make_test(post=post), _make_node())
+        assert hits == []
+
+    def test_none_post_is_skipped(self) -> None:
+        """Test none post is skipped."""
+        hits = collect_taint_hits(_make_test(post=None), _make_node())
+        assert hits == []
+
+
+class TestReceiptSink:
+    """expected_receipt fields are flagged on presence, not taint."""
+
+    def test_cumulative_gas_used_recorded(self) -> None:
+        """Test cumulative gas used recorded."""
+        tx = SimpleNamespace(
+            error=None,
+            expected_receipt=SimpleNamespace(
+                cumulative_gas_used=21000,
+                gas_used=None,
+                blob_gas_used=None,
+            ),
+        )
+        hits = collect_taint_hits(_make_test(tx=tx), _make_node())
+        assert len(hits) == 1
+        assert hits[0]["kind"] == "receipt"
+        assert hits[0]["location"] == "cumulative_gas_used"
+        assert hits[0]["value"] == 21000
+        # No taint involved — origins absent from the hit dict.
+        assert "origins" not in hits[0]
+
+    def test_all_three_receipt_fields(self) -> None:
+        """Test all three receipt fields."""
+        tx = SimpleNamespace(
+            error=None,
+            expected_receipt=SimpleNamespace(
+                cumulative_gas_used=100,
+                gas_used=200,
+                blob_gas_used=300,
+            ),
+        )
+        hits = collect_taint_hits(_make_test(tx=tx), _make_node())
+        locations = {h["location"] for h in hits}
+        assert locations == {
+            "cumulative_gas_used",
+            "gas_used",
+            "blob_gas_used",
+        }
+
+    def test_no_expected_receipt_skipped(self) -> None:
+        """Test no expected receipt skipped."""
+        tx = SimpleNamespace(error=None, expected_receipt=None)
+        assert collect_taint_hits(_make_test(tx=tx), _make_node()) == []
+
+
+class TestHeaderAndBlockSinks:
+    """block.header_verify and block.expected_gas_used flagged on presence."""
+
+    def test_header_verify_blob_gas_used(self) -> None:
+        """Test header verify blob gas used."""
+        block = SimpleNamespace(
+            header_verify=SimpleNamespace(gas_used=None, blob_gas_used=131072),
+            expected_gas_used=None,
+        )
+        hits = collect_taint_hits(_make_test(blocks=[block]), _make_node())
+        assert len(hits) == 1
+        assert hits[0]["kind"] == "header"
+        assert hits[0]["location"] == "block[0].blob_gas_used"
+        assert hits[0]["value"] == 131072
+
+    def test_block_expected_gas_used(self) -> None:
+        """Test block expected gas used."""
+        block = SimpleNamespace(header_verify=None, expected_gas_used=199_156)
+        hits = collect_taint_hits(_make_test(blocks=[block]), _make_node())
+        assert len(hits) == 1
+        assert hits[0]["kind"] == "block_expected_gas_used"
+        assert hits[0]["value"] == 199_156
+
+    def test_multiple_blocks_indexed(self) -> None:
+        """Test multiple blocks indexed."""
+        blocks = [
+            SimpleNamespace(header_verify=None, expected_gas_used=100),
+            SimpleNamespace(header_verify=None, expected_gas_used=200),
+        ]
+        hits = collect_taint_hits(_make_test(blocks=blocks), _make_node())
+        assert {h["location"] for h in hits} == {
+            "block[0].expected_gas_used",
+            "block[1].expected_gas_used",
+        }
+
+
+class _FakeBenchmarkTest:
+    """Stand-in for ``BenchmarkTest`` so the MRO name check fires."""
+
+    # The walker checks ``c.__name__ == "BenchmarkTest"`` in mro; faking
+    # the class name is enough to exercise the gate.
+
+
+_FakeBenchmarkTest.__name__ = "BenchmarkTest"
+
+
+class TestBenchmarkSink:
+    """expected_benchmark_gas_used requires the BenchmarkTest MRO gate."""
+
+    def test_skipped_on_non_benchmark_test(self) -> None:
+        """Test skipped on non benchmark test."""
+        # SimpleNamespace's MRO doesn't include BenchmarkTest.
+        test = _make_test(expected_benchmark_gas_used=120_000_000)
+        assert collect_taint_hits(test, _make_node()) == []
+
+    def test_recorded_on_benchmark_test(self) -> None:
+        """Test recorded on benchmark test."""
+        # Build a real subclass whose MRO contains a class named
+        # "BenchmarkTest". The walker scans names, not identity.
+        bt = _FakeBenchmarkTest()
+        bt.post = None  # type: ignore[attr-defined]
+        bt.tx = None  # type: ignore[attr-defined]
+        bt.blocks = None  # type: ignore[attr-defined]
+        bt.block_exception = None  # type: ignore[attr-defined]
+        bt.expected_benchmark_gas_used = 99_999_999  # type: ignore[attr-defined]
+        hits = collect_taint_hits(bt, _make_node())
+        assert len(hits) == 1
+        assert hits[0]["kind"] == "benchmark"
+        assert hits[0]["value"] == 99_999_999
+
+
+class TestOOGExclusion:
+    """OOG-style tests are excluded — they don't write to positive sinks."""
+
+    def test_tx_error_set_excludes_test(self) -> None:
+        """Test tx error set excludes test."""
+        # Even with a tainted storage value, an error on tx means the
+        # test expects rejection — drop it.
+        post = _make_post({"0xa": _make_account({0: GasTainted(1, ("X",))})})
+        tx = SimpleNamespace(error="some_error", expected_receipt=None)
+        assert (
+            collect_taint_hits(_make_test(post=post, tx=tx), _make_node())
+            == []
+        )
+
+    def test_block_exception_excludes_test(self) -> None:
+        """Test block exception excludes test."""
+        post = _make_post({"0xa": _make_account({0: GasTainted(1, ("X",))})})
+        assert (
+            collect_taint_hits(
+                _make_test(post=post, block_exception="boom"),
+                _make_node(),
+            )
+            == []
+        )
+
+    def test_exception_test_marker_excludes_test(self) -> None:
+        """Test exception test marker excludes test."""
+        post = _make_post({"0xa": _make_account({0: GasTainted(1, ("X",))})})
+        assert (
+            collect_taint_hits(
+                _make_test(post=post), _make_node(has_exception_marker=True)
+            )
+            == []
+        )
+
+
+class TestMixedSinks:
+    """Multiple sinks can fire for the same test."""
+
+    def test_storage_and_receipt(self) -> None:
+        """Test storage and receipt."""
+        post = _make_post({"0xa": _make_account({0: GasTainted(50, ("X",))})})
+        tx = SimpleNamespace(
+            error=None,
+            expected_receipt=SimpleNamespace(
+                cumulative_gas_used=21000,
+                gas_used=None,
+                blob_gas_used=None,
+            ),
+        )
+        hits = collect_taint_hits(_make_test(post=post, tx=tx), _make_node())
+        kinds = {h["kind"] for h in hits}
+        assert kinds == {"storage", "receipt"}

From 815477aa7a2e6400227348ad4b10724238817833 Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Tue, 26 May 2026 18:48:47 +0700
Subject: [PATCH 4/9] refactor(tooling): type the gas_taint walker against real
 spec models

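Replace ``Any`` and ``getattr``-based field access in the sink walker
with direct attribute access against ``BaseTest`` / ``StateTest`` /
``BlockchainTest`` / ``BenchmarkTest`` / ``Alloc`` / ``Transaction`` /
``Block``. Spec-type-specific access (``test.tx``, ``test.blocks``) now
dispatches via ``isinstance`` rather than ``getattr(test, "...", None)``,
so a future rename of ``post`` / ``tx`` / ``expected_receipt`` / etc.
fails type-checking instead of silently returning empty hits.

Behaviour changes: for ``BlockchainTest`` the walker now also visits the
``expected_receipt`` of every transaction in every block, and a
``Removable`` ``header_verify.blob_gas_used`` sentinel is skipped rather
than passed to ``_record_present``.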

Also tightens the few remaining ``Any``s where possible: the wrapped
opcode-gas calculator is now typed as ``Callable[[OpcodeBase], int]``,
and ``pytest_testnodedown`` uses ``xdist.workermanage.WorkerController``
(stub added) and ``object | None``, matching the upstream xdist hook.

The unit tests built with ``SimpleNamespace`` fakes no longer satisfy
the new typed signature and will be replaced in a follow-up commit.
---
 .../plugins/filler/gas_taint.py               | 129 ++++++++++++------
 packages/testing/stubs/xdist/workermanage.pyi |   7 +
 2 files changed, 91 insertions(+), 45 deletions(-)
 create mode 100644 packages/testing/stubs/xdist/workermanage.pyi

diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
index 050ba574ab2..2f9b182d7b0 100644
--- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
@@ -35,10 +35,31 @@
 import json
 from dataclasses import fields, replace
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Set, Tuple
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    List,
+    Set,
+    Tuple,
+)
 
 import pytest
 
+from execution_testing.specs.base import BaseTest
+from execution_testing.specs.benchmark import BenchmarkTest
+from execution_testing.specs.blockchain import Block, BlockchainTest
+from execution_testing.specs.state import StateTest
+from execution_testing.test_types.account_types import Alloc
+from execution_testing.test_types.transaction_types import Transaction
+from execution_testing.test_types.utils import Removable
+from execution_testing.vm.bases import OpcodeBase
+
+if TYPE_CHECKING:
+    from xdist.workermanage import WorkerController
+
 # ---------------------------------------------------------------------------
 # Taint carrier
 # ---------------------------------------------------------------------------
@@ -168,7 +189,7 @@ def calc(*a: Any, **kw: Any) -> Any:
 def _wrap_opcode_calc(c: type, fn: Callable, args: tuple, kwargs: dict) -> Any:
     inner = fn(c, *args, **kwargs)
 
-    def calc(opcode: Any) -> Any:
+    def calc(opcode: OpcodeBase) -> int:
         result = inner(opcode)
         if isinstance(result, int) and not _is_tainted(result):
             name = getattr(opcode, "_name_", None) or str(opcode)
@@ -260,9 +281,11 @@ def uninstall_taint() -> None:
 
 
 def _record_tainted(
-    hits: List[dict], kind: str, location: str, value: Any
+    hits: List[dict], kind: str, location: str, value: int | None
 ) -> None:
     """Record only if value carries gas-source taint (used for storage)."""
+    if value is None:
+        return
     origins = _origins_of(value)
     if origins is None:
         return
@@ -277,12 +300,12 @@ def _record_tainted(
 
 
 def _record_present(
-    hits: List[dict], kind: str, location: str, value: Any
+    hits: List[dict], kind: str, location: str, value: int | None
 ) -> None:
     """Record if value is set (field-name-implies-gas-assertion sinks)."""
     if value is None:
         return
-    hit: Dict[str, Any] = {
+    hit: Dict[str, object] = {
         "kind": kind,
         "location": location,
         "value": int(value),
@@ -293,21 +316,14 @@ def _record_present(
     hits.append(hit)
 
 
-def _walk_storage(hits: List[dict], post: Any) -> None:
+def _walk_storage(hits: List[dict], post: Alloc | None) -> None:
     """Storage slots are general-purpose; only flag tainted values."""
     if post is None:
         return
-    try:
-        items = post.items()
-    except AttributeError:
-        return
-    for address, account in items:
+    for address, account in post.items():
         if account is None:
             continue
-        storage = getattr(account, "storage", None)
-        if storage is None:
-            continue
-        for slot, value in dict(storage.root).items():
+        for slot, value in account.storage.root.items():
             _record_tainted(
                 hits,
                 "storage",
@@ -316,41 +332,56 @@ def _walk_storage(hits: List[dict], post: Any) -> None:
             )
 
 
-def _walk_receipt(hits: List[dict], tx: Any) -> None:
+def _walk_receipt(hits: List[dict], tx: Transaction | None) -> None:
     """Receipt gas fields are self-identifying — flag on presence."""
     if tx is None:
         return
-    receipt = getattr(tx, "expected_receipt", None)
+    receipt = tx.expected_receipt
     if receipt is None:
         return
-    for field in ("cumulative_gas_used", "gas_used", "blob_gas_used"):
-        _record_present(hits, "receipt", field, getattr(receipt, field, None))
+    _record_present(
+        hits, "receipt", "cumulative_gas_used", receipt.cumulative_gas_used
+    )
+    _record_present(hits, "receipt", "gas_used", receipt.gas_used)
+    _record_present(hits, "receipt", "blob_gas_used", receipt.blob_gas_used)
 
 
-def _walk_header_and_block(hits: List[dict], block: Any, i: int) -> None:
+def _walk_header_and_block(hits: List[dict], block: Block, i: int) -> None:
     """Header gas fields and expected_gas_used are self-identifying."""
-    header_verify = getattr(block, "header_verify", None)
+    header_verify = block.header_verify
     if header_verify is not None:
-        for field in ("gas_used", "blob_gas_used"):
+        _record_present(
+            hits, "header", f"block[{i}].gas_used", header_verify.gas_used
+        )
+        # blob_gas_used is ``Removable | HexNumber | None`` — the
+        # ``Removable`` sentinel means "delete this from the verified
+        # header" and is not a gas assertion.
+        blob_gas_used = header_verify.blob_gas_used
+        if not isinstance(blob_gas_used, Removable):
             _record_present(
                 hits,
                 "header",
-                f"block[{i}].{field}",
-                getattr(header_verify, field, None),
+                f"block[{i}].blob_gas_used",
+                blob_gas_used,
             )
     _record_present(
         hits,
         "block_expected_gas_used",
         f"block[{i}].expected_gas_used",
-        getattr(block, "expected_gas_used", None),
+        block.expected_gas_used,
     )
 
 
-def _is_oog_test(test: Any, node: pytest.Item | None) -> bool:
-    tx = getattr(test, "tx", None)
-    if tx is not None and getattr(tx, "error", None) is not None:
+def _is_oog_test(test: BaseTest, node: pytest.Item | None) -> bool:
+    if (
+        isinstance(test, StateTest)
+        and test.tx.error is not None
+    ):
         return True
-    if getattr(test, "block_exception", None):
+    if (
+        isinstance(test, StateTest)
+        and test.block_exception is not None
+    ):
         return True
     if (
         node is not None
@@ -360,28 +391,33 @@ def _is_oog_test(test: Any, node: pytest.Item | None) -> bool:
     return False
 
 
-def collect_taint_hits(test: Any, node: pytest.Item | None) -> List[dict]:
+def collect_taint_hits(
+    test: BaseTest, node: pytest.Item | None
+) -> List[dict]:
     """Walk all known gas-assertion sinks on the test object."""
     if _is_oog_test(test, node):
         return []
 
     hits: List[dict] = []
-    _walk_storage(hits, getattr(test, "post", None))
-    _walk_receipt(hits, getattr(test, "tx", None))
-
-    blocks = getattr(test, "blocks", None)
-    if blocks is not None:
-        for i, block in enumerate(blocks):
+    if isinstance(test, StateTest):
+        _walk_storage(hits, test.post)
+        _walk_receipt(hits, test.tx)
+    elif isinstance(test, BlockchainTest):
+        _walk_storage(hits, test.post)
+        for i, block in enumerate(test.blocks):
             _walk_header_and_block(hits, block, i)
+            for tx in block.txs:
+                _walk_receipt(hits, tx)
 
-    # expected_benchmark_gas_used is auto-defaulted on every test by the
-    # filler, so only treat it as a sink on actual BenchmarkTest instances.
-    if any(c.__name__ == "BenchmarkTest" for c in type(test).__mro__):
+    # ``expected_benchmark_gas_used`` is auto-defaulted on every test by
+    # the filler, so only treat it as a sink on actual BenchmarkTest
+    # instances.
+    if isinstance(test, BenchmarkTest):
         _record_present(
             hits,
             "benchmark",
             "expected_benchmark_gas_used",
-            getattr(test, "expected_benchmark_gas_used", None),
+            test.expected_benchmark_gas_used,
         )
 
     return hits
@@ -436,16 +472,17 @@ def pytest_configure(config: pytest.Config) -> None:
 
 
 @pytest.hookimpl(hookwrapper=True)
-def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> Any:
+def pytest_runtest_makereport(
+    item: pytest.Item, call: pytest.CallInfo
+) -> Generator[None, None, None]:
     """Attach a ``gas_check`` marker if the item has taint hits."""
-    outcome = yield
+    yield
     if call.when != "call":
-        return outcome
+        return
     for key, value in item.user_properties:
         if key == "gas_taint_hits" and value:
             item.add_marker(pytest.mark.gas_check)
             break
-    return outcome
 
 
 def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
@@ -472,7 +509,9 @@ def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
     path.write_text(json.dumps(results, indent=2, sort_keys=True))
 
 
-def pytest_testnodedown(node: Any, error: Any) -> None:
+def pytest_testnodedown(
+    node: "WorkerController", error: object | None
+) -> None:
     """Aggregate worker results into the master's results dict."""
     del error
     config = node.config
diff --git a/packages/testing/stubs/xdist/workermanage.pyi b/packages/testing/stubs/xdist/workermanage.pyi
new file mode 100644
index 00000000000..334839b5fe1
--- /dev/null
+++ b/packages/testing/stubs/xdist/workermanage.pyi
@@ -0,0 +1,7 @@
+from typing import Any
+
+import pytest
+
+class WorkerController:
+    config: pytest.Config
+    workeroutput: dict[str, Any]

From c5fa8d7f1330aea438ae07966ee0935b81602e9b Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Tue, 26 May 2026 18:56:23 +0700
Subject: [PATCH 5/9] test(tooling): use real spec models in gas_taint walker
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the SimpleNamespace-based fakes in the walker tests with real
``StateTest`` / ``BlockchainTest`` / ``BenchmarkTest`` Pydantic instances
so that field renames or retypings on the spec models fail the unit
tests instead of silently masking them.

The storage and mixed-sink tests now use the ``taint_installed`` fixture
because Pydantic ``Storage`` validation strips the ``GasTainted``
subclass without the ``FixedSizeHexNumber.__new__`` patch.

Carrier tests (``TestGasTaintedCarrier``) and install tests
(``TestTaintInstallation``) are unchanged — they already exercised the
real types.
---
 .../plugins/filler/gas_taint.py               |  14 +-
 .../plugins/filler/tests/test_gas_taint.py    | 310 ++++++++++--------
 2 files changed, 175 insertions(+), 149 deletions(-)

diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
index 2f9b182d7b0..0d6db824ac3 100644
--- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/gas_taint.py
@@ -373,15 +373,9 @@ def _walk_header_and_block(hits: List[dict], block: Block, i: int) -> None:
 
 
 def _is_oog_test(test: BaseTest, node: pytest.Item | None) -> bool:
-    if (
-        isinstance(test, StateTest)
-        and test.tx.error is not None
-    ):
+    if isinstance(test, StateTest) and test.tx.error is not None:
         return True
-    if (
-        isinstance(test, StateTest)
-        and test.block_exception is not None
-    ):
+    if isinstance(test, StateTest) and test.block_exception is not None:
         return True
     if (
         node is not None
@@ -391,9 +385,7 @@ def _is_oog_test(test: BaseTest, node: pytest.Item | None) -> bool:
     return False
 
 
-def collect_taint_hits(
-    test: BaseTest, node: pytest.Item | None
-) -> List[dict]:
+def collect_taint_hits(test: BaseTest, node: pytest.Item | None) -> List[dict]:
     """Walk all known gas-assertion sinks on the test object."""
     if _is_oog_test(test, node):
         return []
diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py
index 8e7ff9e03cc..b712cda217c 100644
--- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint.py
@@ -2,11 +2,12 @@
 
 from __future__ import annotations
 
-from types import SimpleNamespace
 from typing import Any, Callable, Iterator, cast
+from unittest.mock import MagicMock
 
 import pytest
 
+from execution_testing.base_types import Account, Address
 from execution_testing.base_types.base_types import (
     HashInt,
     HexNumber,
@@ -21,6 +22,16 @@
     install_taint,
     uninstall_taint,
 )
+from execution_testing.exceptions.exceptions.transaction import (
+    TransactionException,
+)
+from execution_testing.forks.forks.forks import Cancun
+from execution_testing.specs.benchmark import BenchmarkTest
+from execution_testing.specs.blockchain import Block, BlockchainTest, Header
+from execution_testing.specs.state import StateTest
+from execution_testing.test_types.account_types import Alloc
+from execution_testing.test_types.receipt_types import TransactionReceipt
+from execution_testing.test_types.transaction_types import Transaction
 
 # ---------------------------------------------------------------------------
 # GasTainted carrier
@@ -234,105 +245,93 @@ def test_uninstall_reverts_patches(self) -> None:
 
 # ---------------------------------------------------------------------------
 # collect_taint_hits walker
+#
+# These tests instantiate real ``StateTest`` / ``BlockchainTest`` /
+# ``BenchmarkTest`` Pydantic models so that the walker's typed signature
+# guards the field surface. Renaming or retyping a sink field becomes a
+# type-check or test failure rather than a silent miss.
 # ---------------------------------------------------------------------------
 
 
-def _make_account(storage_dict: dict | None) -> SimpleNamespace:
-    return SimpleNamespace(
-        storage=SimpleNamespace(root=storage_dict) if storage_dict else None
-    )
-
-
-def _make_post(accounts: dict) -> dict:
-    """Return a dict that quacks like ``Alloc`` (provides ``.items()``)."""
-    return accounts
-
+_ADDR = Address(0x1234)
 
-def _make_test(
-    *,
-    post: Any = None,
-    tx: Any = None,
-    blocks: Any = None,
-    block_exception: Any = None,
-    expected_benchmark_gas_used: Any = None,
-) -> SimpleNamespace:
-    """
-    Build a minimal test object with the attributes the walker reads.
 
-    The benchmark sink gates on the class MRO containing a class named
-    ``BenchmarkTest``; SimpleNamespace doesn't satisfy that, so the
-    benchmark tests build a separate real subclass below.
-    """
-    return SimpleNamespace(
-        post=post,
-        tx=tx,
-        blocks=blocks,
-        block_exception=block_exception,
-        expected_benchmark_gas_used=expected_benchmark_gas_used,
+def _state_test_with_storage(value: int | None) -> StateTest:
+    storage = (
+        Storage(cast(Any, {0: value})) if value is not None else Storage()
+    )
+    return StateTest(
+        fork=Cancun,
+        pre=Alloc(),
+        post=Alloc(cast(Any, {_ADDR: Account(storage=storage)})),
+        tx=Transaction(),
     )
 
 
-def _make_node(has_exception_marker: bool = False) -> Any:
-    # Returned as Any so test sites can pass the duck-typed namespace
-    # to ``collect_taint_hits`` (which is typed ``pytest.Item | None``).
-    return SimpleNamespace(
-        get_closest_marker=lambda name: (
-            object()
-            if name == "exception_test" and has_exception_marker
-            else None
-        )
+def _node_with_marker(name: str | None) -> pytest.Item:
+    """Mock ``pytest.Item`` whose ``get_closest_marker`` returns truthy."""
+    node = MagicMock(spec=pytest.Item)
+    node.get_closest_marker.side_effect = lambda n: (
+        object() if name is not None and n == name else None
     )
+    return cast(pytest.Item, node)
 
 
+@pytest.mark.usefixtures("taint_installed")
 class TestStorageSink:
-    """post[addr].storage[slot] requires taint to flag."""
+    """
+    ``post[addr].storage[slot]`` requires taint to flag.
+
+    The ``taint_installed`` fixture is needed so that ``GasTainted``
+    values survive the ``Storage`` Pydantic validator (which constructs
+    a ``HashInt`` for each value); the walker itself doesn't depend on
+    installation.
+    """
 
     def test_tainted_storage_value_is_recorded(self) -> None:
         """Test tainted storage value is recorded."""
-        post = _make_post(
-            {"0xabc": _make_account({0: GasTainted(42, ("Y",))})}
-        )
-        hits = collect_taint_hits(_make_test(post=post), _make_node())
+        test = _state_test_with_storage(GasTainted(42, ("Y",)))
+        hits = collect_taint_hits(test, None)
         assert len(hits) == 1
         assert hits[0]["kind"] == "storage"
         assert hits[0]["value"] == 42
         assert hits[0]["origins"] == ["Y"]
-        assert "0xabc" in hits[0]["location"]
+        assert str(_ADDR) in hits[0]["location"]
 
     def test_untainted_storage_value_is_skipped(self) -> None:
         """Test untainted storage value is skipped."""
         # A plain int in storage — the whole point of using taint here is
         # that storage slots can hold anything.
-        post = _make_post({"0xabc": _make_account({0: 42})})
-        hits = collect_taint_hits(_make_test(post=post), _make_node())
-        assert hits == []
+        test = _state_test_with_storage(42)
+        assert collect_taint_hits(test, None) == []
 
     def test_account_with_no_storage_is_skipped(self) -> None:
         """Test account with no storage is skipped."""
-        post = _make_post({"0xabc": _make_account(None)})
-        hits = collect_taint_hits(_make_test(post=post), _make_node())
-        assert hits == []
+        test = StateTest(
+            fork=Cancun,
+            pre=Alloc(),
+            post=Alloc(cast(Any, {_ADDR: Account()})),
+            tx=Transaction(),
+        )
+        assert collect_taint_hits(test, None) == []
 
-    def test_none_post_is_skipped(self) -> None:
-        """Test none post is skipped."""
-        hits = collect_taint_hits(_make_test(post=None), _make_node())
-        assert hits == []
+    def test_empty_post_is_skipped(self) -> None:
+        """Test empty post is skipped."""
+        test = StateTest(
+            fork=Cancun, pre=Alloc(), post=Alloc(), tx=Transaction()
+        )
+        assert collect_taint_hits(test, None) == []
 
 
 class TestReceiptSink:
-    """expected_receipt fields are flagged on presence, not taint."""
+    """``expected_receipt`` fields are flagged on presence, not taint."""
 
     def test_cumulative_gas_used_recorded(self) -> None:
         """Test cumulative gas used recorded."""
-        tx = SimpleNamespace(
-            error=None,
-            expected_receipt=SimpleNamespace(
-                cumulative_gas_used=21000,
-                gas_used=None,
-                blob_gas_used=None,
-            ),
-        )
-        hits = collect_taint_hits(_make_test(tx=tx), _make_node())
+        tx = Transaction()
+        tx.expected_receipt = TransactionReceipt(cumulative_gas_used=21000)
+        test = StateTest(fork=Cancun, pre=Alloc(), post=Alloc(), tx=tx)
+        hits = collect_taint_hits(test, None)
         assert len(hits) == 1
         assert hits[0]["kind"] == "receipt"
         assert hits[0]["location"] == "cumulative_gas_used"
@@ -342,15 +341,12 @@ def test_cumulative_gas_used_recorded(self) -> None:
 
     def test_all_three_receipt_fields(self) -> None:
         """Test all three receipt fields."""
-        tx = SimpleNamespace(
-            error=None,
-            expected_receipt=SimpleNamespace(
-                cumulative_gas_used=100,
-                gas_used=200,
-                blob_gas_used=300,
-            ),
+        tx = Transaction()
+        tx.expected_receipt = TransactionReceipt(
+            cumulative_gas_used=100, gas_used=200, blob_gas_used=300
         )
-        hits = collect_taint_hits(_make_test(tx=tx), _make_node())
+        test = StateTest(fork=Cancun, pre=Alloc(), post=Alloc(), tx=tx)
+        hits = collect_taint_hits(test, None)
         locations = {h["location"] for h in hits}
         assert locations == {
             "cumulative_gas_used",
@@ -360,20 +356,22 @@ def test_all_three_receipt_fields(self) -> None:
 
     def test_no_expected_receipt_skipped(self) -> None:
         """Test no expected receipt skipped."""
-        tx = SimpleNamespace(error=None, expected_receipt=None)
-        assert collect_taint_hits(_make_test(tx=tx), _make_node()) == []
+        test = StateTest(
+            fork=Cancun, pre=Alloc(), post=Alloc(), tx=Transaction()
+        )
+        assert collect_taint_hits(test, None) == []
 
 
 class TestHeaderAndBlockSinks:
-    """block.header_verify and block.expected_gas_used flagged on presence."""
+    """``header_verify`` and ``expected_gas_used`` flagged on presence."""
 
     def test_header_verify_blob_gas_used(self) -> None:
         """Test header verify blob gas used."""
-        block = SimpleNamespace(
-            header_verify=SimpleNamespace(gas_used=None, blob_gas_used=131072),
-            expected_gas_used=None,
+        block = Block(header_verify=Header(blob_gas_used=131072))
+        test = BlockchainTest(
+            fork=Cancun, pre=Alloc(), post=Alloc(), blocks=[block]
         )
-        hits = collect_taint_hits(_make_test(blocks=[block]), _make_node())
+        hits = collect_taint_hits(test, None)
         assert len(hits) == 1
         assert hits[0]["kind"] == "header"
         assert hits[0]["location"] == "block[0].blob_gas_used"
@@ -381,8 +379,14 @@ def test_header_verify_blob_gas_used(self) -> None:
 
     def test_block_expected_gas_used(self) -> None:
         """Test block expected gas used."""
-        block = SimpleNamespace(header_verify=None, expected_gas_used=199_156)
-        hits = collect_taint_hits(_make_test(blocks=[block]), _make_node())
+        block = Block(expected_gas_used=HexNumber(199_156))
+        test = BlockchainTest(
+            fork=Cancun, pre=Alloc(), post=Alloc(), blocks=[block]
+        )
+        hits = collect_taint_hits(test, None)
+        # Block inherits gas_used / blob_gas_used from Header (both
+        # default None), so only the explicit expected_gas_used is
+        # recorded.
         assert len(hits) == 1
         assert hits[0]["kind"] == "block_expected_gas_used"
         assert hits[0]["value"] == 199_156
@@ -390,101 +394,131 @@ def test_block_expected_gas_used(self) -> None:
     def test_multiple_blocks_indexed(self) -> None:
         """Test multiple blocks indexed."""
         blocks = [
-            SimpleNamespace(header_verify=None, expected_gas_used=100),
-            SimpleNamespace(header_verify=None, expected_gas_used=200),
+            Block(expected_gas_used=HexNumber(100)),
+            Block(expected_gas_used=HexNumber(200)),
         ]
-        hits = collect_taint_hits(_make_test(blocks=blocks), _make_node())
+        test = BlockchainTest(
+            fork=Cancun, pre=Alloc(), post=Alloc(), blocks=blocks
+        )
+        hits = collect_taint_hits(test, None)
         assert {h["location"] for h in hits} == {
             "block[0].expected_gas_used",
             "block[1].expected_gas_used",
         }
 
 
-class _FakeBenchmarkTest:
-    """Stand-in for ``BenchmarkTest`` so the MRO name check fires."""
-
-    # The walker checks ``c.__name__ == "BenchmarkTest"`` in mro; faking
-    # the class name is enough to exercise the gate.
-
-
-_FakeBenchmarkTest.__name__ = "BenchmarkTest"
-
-
 class TestBenchmarkSink:
-    """expected_benchmark_gas_used requires the BenchmarkTest MRO gate."""
+    """``expected_benchmark_gas_used`` is recorded only on BenchmarkTest."""
 
     def test_skipped_on_non_benchmark_test(self) -> None:
         """Test skipped on non benchmark test."""
-        # SimpleNamespace's MRO doesn't include BenchmarkTest.
-        test = _make_test(expected_benchmark_gas_used=120_000_000)
-        assert collect_taint_hits(test, _make_node()) == []
+        # The filler auto-defaults expected_benchmark_gas_used on every
+        # test; without the isinstance(test, BenchmarkTest) gate, this
+        # would emit a benchmark hit even on a plain StateTest.
+        test = StateTest(
+            fork=Cancun, pre=Alloc(), post=Alloc(), tx=Transaction()
+        )
+        # Sanity: set it just like the filler would for non-benchmark
+        # tests.
+        test.expected_benchmark_gas_used = HexNumber(120_000_000)
+        assert collect_taint_hits(test, None) == []
 
     def test_recorded_on_benchmark_test(self) -> None:
         """Test recorded on benchmark test."""
-        # Build a real subclass whose MRO contains a class named
-        # "BenchmarkTest". The walker scans names, not identity.
-        bt = _FakeBenchmarkTest()
-        bt.post = None  # type: ignore[attr-defined]
-        bt.tx = None  # type: ignore[attr-defined]
-        bt.blocks = None  # type: ignore[attr-defined]
-        bt.block_exception = None  # type: ignore[attr-defined]
-        bt.expected_benchmark_gas_used = 99_999_999  # type: ignore[attr-defined]
-        hits = collect_taint_hits(bt, _make_node())
+        bench = BenchmarkTest(
+            fork=Cancun,
+            pre=Alloc(),
+            tx=Transaction(),
+            expected_benchmark_gas_used=HexNumber(99_999_999),
+        )
+        hits = collect_taint_hits(bench, None)
         assert len(hits) == 1
         assert hits[0]["kind"] == "benchmark"
         assert hits[0]["value"] == 99_999_999
 
 
+@pytest.mark.usefixtures("taint_installed")
 class TestOOGExclusion:
     """OOG-style tests are excluded — they don't write to positive sinks."""
 
     def test_tx_error_set_excludes_test(self) -> None:
         """Test tx error set excludes test."""
-        # Even with a tainted storage value, an error on tx means the
-        # test expects rejection — drop it.
-        post = _make_post({"0xa": _make_account({0: GasTainted(1, ("X",))})})
-        tx = SimpleNamespace(error="some_error", expected_receipt=None)
-        assert (
-            collect_taint_hits(_make_test(post=post, tx=tx), _make_node())
-            == []
+        # Even with a tainted storage value, a tx.error means the test
+        # expects rejection — drop it.
+        tx = Transaction(error=TransactionException.INTRINSIC_GAS_TOO_LOW)
+        test = StateTest(
+            fork=Cancun,
+            pre=Alloc(),
+            post=Alloc(
+                cast(
+                    Any,
+                    {
+                        _ADDR: Account(
+                            storage=Storage(
+                                cast(Any, {0: GasTainted(1, ("X",))})
+                            )
+                        )
+                    },
+                )
+            ),
+            tx=tx,
         )
+        assert collect_taint_hits(test, None) == []
 
     def test_block_exception_excludes_test(self) -> None:
         """Test block exception excludes test."""
-        post = _make_post({"0xa": _make_account({0: GasTainted(1, ("X",))})})
-        assert (
-            collect_taint_hits(
-                _make_test(post=post, block_exception="boom"),
-                _make_node(),
-            )
-            == []
+        test = StateTest(
+            fork=Cancun,
+            pre=Alloc(),
+            post=Alloc(
+                cast(
+                    Any,
+                    {
+                        _ADDR: Account(
+                            storage=Storage(
+                                cast(Any, {0: GasTainted(1, ("X",))})
+                            )
+                        )
+                    },
+                )
+            ),
+            tx=Transaction(),
+            block_exception=TransactionException.INTRINSIC_GAS_TOO_LOW,
         )
+        assert collect_taint_hits(test, None) == []
 
     def test_exception_test_marker_excludes_test(self) -> None:
         """Test exception test marker excludes test."""
-        post = _make_post({"0xa": _make_account({0: GasTainted(1, ("X",))})})
-        assert (
-            collect_taint_hits(
-                _make_test(post=post), _make_node(has_exception_marker=True)
-            )
-            == []
-        )
+        test = _state_test_with_storage(GasTainted(1, ("X",)))
+        node = _node_with_marker("exception_test")
+        assert collect_taint_hits(test, node) == []
 
 
+@pytest.mark.usefixtures("taint_installed")
 class TestMixedSinks:
     """Multiple sinks can fire for the same test."""
 
     def test_storage_and_receipt(self) -> None:
         """Test storage and receipt."""
-        post = _make_post({"0xa": _make_account({0: GasTainted(50, ("X",))})})
-        tx = SimpleNamespace(
-            error=None,
-            expected_receipt=SimpleNamespace(
-                cumulative_gas_used=21000,
-                gas_used=None,
-                blob_gas_used=None,
+        tx = Transaction()
+        tx.expected_receipt = TransactionReceipt(cumulative_gas_used=21000)
+        test = StateTest(
+            fork=Cancun,
+            pre=Alloc(),
+            post=Alloc(
+                cast(
+                    Any,
+                    {
+                        _ADDR: Account(
+                            storage=Storage(
+                                cast(Any, {0: GasTainted(50, ("X",))})
+                            )
+                        )
+                    },
+                )
             ),
+            tx=tx,
         )
-        hits = collect_taint_hits(_make_test(post=post, tx=tx), _make_node())
+        hits = collect_taint_hits(test, None)
         kinds = {h["kind"] for h in hits}
         assert kinds == {"storage", "receipt"}

From 1af713f1a13ca1e67fbeaaca312295c6f6c09a09 Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Tue, 26 May 2026 19:04:02 +0700
Subject: [PATCH 6/9] test(tooling): pytester-based e2e tests for gas_taint
 plugin

Add four end-to-end tests that load the real fill plugin chain into a
pytester subprocess and exercise the bits the unit tests deliberately
skip:

- ``--detect-gas-checks`` / ``--gas-check-report`` appear in
  ``fill --help``.
- A synthetic test using ``CodeGasMeasure`` is detected and recorded as
  a ``storage`` hit with gas-derived origins.
- A synthetic test marked ``@pytest.mark.exception_test`` is excluded
  from the report.
- Without the flag, no report file is produced.

Co-located with ``test_benchmarking.py`` and added to ``--ignore`` in
``test-tests`` / ``test-tests-pypy`` so unit-test runs stay fast; the
new tests run under ``test-tests-bench`` alongside the existing pytester
suite. They need a t8n binary on the runner (``evm`` by default;
override with ``EVM_BIN``).
---
 Justfile                                      |   5 +-
 .../filler/tests/test_gas_taint_e2e.py        | 229 ++++++++++++++++++
 2 files changed, 233 insertions(+), 1 deletion(-)
 create mode 100644 packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py

diff --git a/Justfile b/Justfile
index 39041974066..e116c458d41 100644
--- a/Justfile
+++ b/Justfile
@@ -181,6 +181,7 @@ test-tests *args:
         -n {{ xdist_workers }} \
         --basetemp="{{ output_dir }}/test-tests/tmp" \
         --ignore=src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py \
+        --ignore=src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py \
         "$@" \
         src
 
@@ -192,6 +193,7 @@ test-tests-pypy *args:
         -n auto --maxprocesses 6 \
         --basetemp="{{ output_dir }}/test-tests-pypy/tmp" \
         --ignore=src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py \
+        --ignore=src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py \
         "$@" \
         src
 
@@ -203,7 +205,8 @@ test-tests-bench *args:
     uv run pytest \
         --basetemp="{{ output_dir }}/test-tests-bench/tmp" \
         "$@" \
-        packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py
+        packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py \
+        packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
 
 # Run CI release script integration tests
 [group('unit tests')]
diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
new file mode 100644
index 00000000000..c2e54c2d4a2
--- /dev/null
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
@@ -0,0 +1,229 @@
+"""
+End-to-end pytester tests for the gas_taint plugin.
+
+These tests load the real fill plugin chain into a pytester subprocess
+and verify that ``--detect-gas-checks`` produces the expected JSON
+report. They cover the integration that the unit tests in
+``test_gas_taint.py`` deliberately don't:
+
+- ``pytest_addoption`` / ``pytest_configure`` parse and propagate the
+  flag.
+- ``install_taint`` runs once at session start.
+- The hook inside ``BaseTestWrapper.__init__`` actually fires for each
+  test and pushes hits into ``request.config._gas_taint_results``.
+- ``pytest_sessionfinish`` / ``pytest_testnodedown`` write the JSON
+  report to the path given to ``--gas-check-report``.
+
+Each test runs a full ``fill`` invocation against a synthetic test
+module, so an actual t8n binary must be available (defaults to ``evm``;
+override with ``EVM_BIN`` like the benchmarking tests).
+"""
+
+import json
+import os
+import textwrap
+from pathlib import Path
+
+import pytest
+
+GAS_TAINT_EVM_T8N = os.environ.get("EVM_BIN", "evm")
+
+
+# A synthetic test module that asserts a gas-derived value in post
+# storage via ``CodeGasMeasure``. The detector reports it as a hit of
+# kind ``storage``.
+GAS_CHECK_TEST_MODULE = textwrap.dedent(
+    """\
+    import pytest
+    from execution_testing import (
+        Account,
+        Alloc,
+        CodeGasMeasure,
+        Environment,
+        Op,
+        StateTestFiller,
+        Transaction,
+    )
+
+    @pytest.mark.valid_at("Cancun")
+    def test_dummy_gas_check(
+        state_test: StateTestFiller, pre: Alloc, fork
+    ) -> None:
+        gas_measure = CodeGasMeasure(code=Op.PUSH1[1] + Op.POP)
+        expected_gas = (Op.PUSH1[1] + Op.POP).gas_cost(fork)
+        contract = pre.deploy_contract(code=gas_measure)
+        sender = pre.fund_eoa()
+        state_test(
+            env=Environment(),
+            pre=pre,
+            post={contract: Account(storage={0: expected_gas})},
+            tx=Transaction(sender=sender, to=contract, gas_limit=200_000),
+        )
+    """
+)
+
+
+# A synthetic OOG-style test marked with ``exception_test``. The walker
+# must skip it even though it ends up running through the same hook.
+OOG_TEST_MODULE = textwrap.dedent(
+    """\
+    import pytest
+    from execution_testing import (
+        Account,
+        Alloc,
+        Environment,
+        Op,
+        StateTestFiller,
+        Transaction,
+        TransactionException,
+    )
+
+    @pytest.mark.valid_at("Cancun")
+    @pytest.mark.exception_test
+    def test_dummy_oog(
+        state_test: StateTestFiller, pre: Alloc
+    ) -> None:
+        contract = pre.deploy_contract(code=Op.STOP)
+        sender = pre.fund_eoa()
+        state_test(
+            env=Environment(),
+            pre=pre,
+            post={},
+            tx=Transaction(
+                sender=sender,
+                to=contract,
+                gas_limit=20_999,
+                error=TransactionException.INTRINSIC_GAS_TOO_LOW,
+            ),
+        )
+    """
+)
+
+
+def _setup_pytester(
+    pytester: pytest.Pytester, test_content: str, filename: str
+) -> Path:
+    """Drop a synthetic test module under ``tests/`` and copy fill.ini."""
+    tests_dir = pytester.mkdir("tests")
+    dummy_dir = tests_dir / "dummy_module"
+    dummy_dir.mkdir()
+    module_path = dummy_dir / filename
+    module_path.write_text(test_content)
+    pytester.copy_example(
+        name="src/execution_testing/cli/pytest_commands/pytest_ini_files/pytest-fill.ini"
+    )
+    return module_path
+
+
+def test_detect_gas_checks_option_added(pytester: pytest.Pytester) -> None:
+    """``--detect-gas-checks`` appears in ``fill --help``."""
+    pytester.copy_example(
+        name="src/execution_testing/cli/pytest_commands/pytest_ini_files/pytest-fill.ini"
+    )
+    result = pytester.runpytest("-c", "pytest-fill.ini", "--help")
+    assert result.ret == 0
+    assert any("--detect-gas-checks" in line for line in result.outlines), (
+        "expected --detect-gas-checks in help output"
+    )
+    assert any("--gas-check-report" in line for line in result.outlines), (
+        "expected --gas-check-report in help output"
+    )
+
+
+def test_gas_check_report_records_storage_hit(
+    pytester: pytest.Pytester, tmp_path: Path
+) -> None:
+    """A synthetic CodeGasMeasure test ends up in the JSON report."""
+    _setup_pytester(pytester, GAS_CHECK_TEST_MODULE, "test_dummy_gas.py")
+    report_path = tmp_path / "gas_check_report.json"
+    output_dir = tmp_path / "fixtures"
+
+    result = pytester.runpytest(
+        "-c",
+        "pytest-fill.ini",
+        "--fork",
+        "Cancun",
+        "--detect-gas-checks",
+        f"--gas-check-report={report_path}",
+        "--no-html",
+        "--skip-index",
+        f"--output={output_dir}",
+        "tests/dummy_module/",
+        "-q",
+    )
+    assert result.ret == 0, f"fill failed:\n{result.outlines}"
+    assert report_path.exists(), "expected report file to be written"
+
+    report = json.loads(report_path.read_text())
+    storage_entries = [
+        (nodeid, hit)
+        for nodeid, hits in report.items()
+        for hit in hits
+        if hit["kind"] == "storage"
+    ]
+    assert storage_entries, (
+        f"expected at least one storage hit; got {report!r}"
+    )
+    for _, hit in storage_entries:
+        # Origins should mention either Bytecode.gas_cost or one of the
+        # underlying gas_costs.* constants.
+        assert any("gas" in origin for origin in hit["origins"]), (
+            f"unexpected origins for hit {hit!r}"
+        )
+
+
+def test_gas_check_report_excludes_oog_test(
+    pytester: pytest.Pytester, tmp_path: Path
+) -> None:
+    """Tests marked ``exception_test`` are excluded from the report."""
+    _setup_pytester(pytester, OOG_TEST_MODULE, "test_dummy_oog.py")
+    report_path = tmp_path / "gas_check_report.json"
+    output_dir = tmp_path / "fixtures"
+
+    result = pytester.runpytest(
+        "-c",
+        "pytest-fill.ini",
+        "--fork",
+        "Cancun",
+        "--detect-gas-checks",
+        f"--gas-check-report={report_path}",
+        "--no-html",
+        "--skip-index",
+        f"--output={output_dir}",
+        "tests/dummy_module/",
+        "-q",
+    )
+    assert result.ret == 0, f"fill failed:\n{result.outlines}"
+    assert report_path.exists()
+
+    report = json.loads(report_path.read_text())
+    # No entry should reference the OOG test.
+    assert not any("test_dummy_oog" in nodeid for nodeid in report), (
+        f"OOG test should be excluded; got report keys: {list(report)!r}"
+    )
+
+
+def test_no_report_written_without_flag(
+    pytester: pytest.Pytester, tmp_path: Path
+) -> None:
+    """Without ``--detect-gas-checks`` no report file is produced."""
+    _setup_pytester(pytester, GAS_CHECK_TEST_MODULE, "test_dummy_gas.py")
+    report_path = tmp_path / "gas_check_report.json"
+    output_dir = tmp_path / "fixtures"
+
+    result = pytester.runpytest(
+        "-c",
+        "pytest-fill.ini",
+        "--fork",
+        "Cancun",
+        f"--gas-check-report={report_path}",  # path supplied but flag off
+        "--no-html",
+        "--skip-index",
+        f"--output={output_dir}",
+        "tests/dummy_module/",
+        "-q",
+    )
+    assert result.ret == 0, f"fill failed:\n{result.outlines}"
+    assert not report_path.exists(), (
+        "report must not be written when detector is disabled"
+    )

From e5259215804c23bd2fab37566e35768702eadbd7 Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Tue, 26 May 2026 20:02:43 +0700
Subject: [PATCH 7/9] =?UTF-8?q?test(tooling):=20e2e=20=E2=80=94=20plain=20?=
 =?UTF-8?q?non-gas=20test=20isn't=20flagged?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a negative integration test that runs a synthetic state test which
stores a literal value (no ``CodeGasMeasure``, no ``expected_receipt``,
no ``header_verify``) and asserts that the resulting report is empty.
This guards against false positives: a test that simply runs
successfully without touching any of the detector's triggers must not
appear in the JSON output.
---
 .../filler/tests/test_gas_taint_e2e.py        | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
index c2e54c2d4a2..aaede425e4c 100644
--- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
@@ -63,6 +63,37 @@ def test_dummy_gas_check(
 )
 
 
+# A plain non-gas test: deploys a tiny contract, asserts a literal
+# storage value (``1`` — not derived from any gas constant), and sets no
+# expected receipt or header field. The walker should produce no hits.
+PLAIN_TEST_MODULE = textwrap.dedent(
+    """\
+    import pytest
+    from execution_testing import (
+        Account,
+        Alloc,
+        Environment,
+        Op,
+        StateTestFiller,
+        Transaction,
+    )
+
+    @pytest.mark.valid_at("Cancun")
+    def test_dummy_non_gas(
+        state_test: StateTestFiller, pre: Alloc
+    ) -> None:
+        contract = pre.deploy_contract(code=Op.SSTORE(0, 1) + Op.STOP)
+        sender = pre.fund_eoa()
+        state_test(
+            env=Environment(),
+            pre=pre,
+            post={contract: Account(storage={0: 1})},
+            tx=Transaction(sender=sender, to=contract, gas_limit=100_000),
+        )
+    """
+)
+
+
 # A synthetic OOG-style test marked with ``exception_test``. The walker
 # must skip it even though it ends up running through the same hook.
 OOG_TEST_MODULE = textwrap.dedent(
@@ -203,6 +234,39 @@ def test_gas_check_report_excludes_oog_test(
     )
 
 
+def test_gas_check_report_excludes_non_gas_test(
+    pytester: pytest.Pytester, tmp_path: Path
+) -> None:
+    """Check that a passing test with no gas assertion stays unreported."""
+    _setup_pytester(pytester, PLAIN_TEST_MODULE, "test_dummy_plain.py")
+    fixtures_dir = tmp_path / "fixtures"
+    report_file = tmp_path / "gas_check_report.json"
+    fill_args = [
+        "-c",
+        "pytest-fill.ini",
+        "--fork",
+        "Cancun",
+        "--detect-gas-checks",
+        f"--gas-check-report={report_file}",
+        "--no-html",
+        "--skip-index",
+        f"--output={fixtures_dir}",
+        "tests/dummy_module/",
+        "-q",
+    ]
+
+    run = pytester.runpytest(*fill_args)
+    assert run.ret == 0, f"fill failed:\n{run.outlines}"
+    assert report_file.exists()
+
+    # The synthetic module stores a literal 1 and sets no
+    # expected_receipt / header_verify / expected_gas_used, so none of
+    # the detector's triggers apply and the report must be empty.
+    report_text = report_file.read_text()
+    found = json.loads(report_text)
+    assert found == {}, f"plain test must produce empty report; got {found!r}"
+
+
 def test_no_report_written_without_flag(
     pytester: pytest.Pytester, tmp_path: Path
 ) -> None:

From b4211a168ec5ad550e63edca3f64e9320784134d Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Tue, 26 May 2026 20:14:50 +0700
Subject: [PATCH 8/9] build(tooling): add `mark_gas_tests` /
 `mark_gas_tests_test` just recipes

- ``just mark_gas_tests <path-or-selector>`` runs ``fill
  --detect-gas-checks`` against the given target, then pipes the report
  through ``scripts/mark_tests.py`` to apply ``@pytest.mark.gas_check``
  to the detected tests. Accepts any pytest path/selector, so the
  workflow can be scoped to a single file or function.

- ``just mark_gas_tests_test`` runs the pytester end-to-end tests for
  the gas_taint plugin in their own ``[gas check]`` group, instead of
  piggybacking on ``test-tests-bench`` (which is for benchmark
  framework tests, not plugin e2e).
---
 Justfile | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/Justfile b/Justfile
index e116c458d41..1851cfc9995 100644
--- a/Justfile
+++ b/Justfile
@@ -205,7 +205,31 @@ test-tests-bench *args:
     uv run pytest \
         --basetemp="{{ output_dir }}/test-tests-bench/tmp" \
         "$@" \
-        packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py \
+        packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py
+
+# Scan tests for gas assertions and apply @pytest.mark.gas_check to them
+[group('gas check')]
+mark_gas_tests *args:
+    @mkdir -p "{{ output_dir }}/mark-gas-tests/tmp"
+    uv run fill \
+        --detect-gas-checks \
+        --gas-check-report="{{ output_dir }}/mark-gas-tests/gas_check_report.json" \
+        -m "not slow" \
+        -n {{ xdist_workers }} --dist=loadgroup \
+        --clean \
+        --skip-index \
+        --output="{{ output_dir }}/mark-gas-tests/fixtures" \
+        "$@"
+    uv run python scripts/mark_tests.py \
+        "{{ output_dir }}/mark-gas-tests/gas_check_report.json"
+
+# Run the gas_taint plugin end-to-end (pytester) tests
+[group('gas check')]
+mark_gas_tests_test *args:
+    @mkdir -p "{{ output_dir }}/mark-gas-tests-test/tmp"
+    uv run pytest \
+        --basetemp="{{ output_dir }}/mark-gas-tests-test/tmp" \
+        "$@" \
         packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
 
 # Run CI release script integration tests

From 1ab1fd05a7865535af6cd7d374562b25586bbbcb Mon Sep 17 00:00:00 2001
From: Leo Lara <leo@leolara.me>
Date: Fri, 29 May 2026 15:18:50 +0700
Subject: [PATCH 9/9] feat(tooling): detect and mark out-of-gas-by-design tests

Add scripts/detect_oog_by_design.py, which scans EELS EIP-3155 traces
under an --evm-dump-dir and flags every test whose execution ran out of
gas by design -- at the top-level transaction or any sub-call depth.
The signal is exact equality on the trace error field == "OutOfGasError"
(the EELS exception class name), giving zero false positives against
other all-gas-consuming halts (StackOverflowError, StackUnderflowError,
InvalidOpcode, Revert). Fill success is the "by design" filter.

Output is the mark_tests.py wrapped form {marker, nodeids}, so the two
chain directly. Sanitized dump-dir names are reconstructed into pytest
node IDs and verified against the source AST (re-adding the test_ prefix
and locating the function, including any enclosing class) before being
emitted; unverifiable names are reported rather than guessed.

Add a `mark_oog_tests` just recipe (detect + mark from pre-produced
traces; the heavy traced fill is documented in the recipe header rather
than run inline, so the fork and worker count stay in the user's hands)
and scripts/fill_for_oog_detection.sh to produce the traces detached.
Register the `out_of_gas` marker so the applied decorator is recognised.

Validated end-to-end against EELS traces: 6 positive OOG variants
detected (top-level, sub-call depth 2-4, ooge/oogm, CREATE-init,
multi-depth in one tx), 5 non-OOG halts correctly ignored, a 1:1 match
with ground truth.
---
 Justfile                                      |  34 ++
 .../plugins/shared/execute_fill.py            |   6 +
 scripts/detect_oog_by_design.py               | 483 ++++++++++++++++++
 scripts/fill_for_oog_detection.sh             |  61 +++
 4 files changed, 584 insertions(+)
 create mode 100644 scripts/detect_oog_by_design.py
 create mode 100755 scripts/fill_for_oog_detection.sh

diff --git a/Justfile b/Justfile
index 1851cfc9995..f4404584794 100644
--- a/Justfile
+++ b/Justfile
@@ -232,6 +232,40 @@ mark_gas_tests_test *args:
         "$@" \
         packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_gas_taint_e2e.py
 
+# Apply @pytest.mark.out_of_gas to tests that ran out of gas by design.
+#
+# This scans EELS EIP-3155 traces you have produced beforehand and marks the
+# tests behind them. Producing the traces is a separate, heavy step (traces
+# are large and memory-hungry) and is deliberately NOT part of this recipe, so
+# you choose the fork, worker count and scope. Produce them with a traced
+# fill, e.g.:
+#
+#   TMPDIR=~/.tmp uv run fill tests/ported_static \
+#       --fork Osaka --traces --evm-dump-dir=~/.tmp/oog-traces \
+#       -m "not slow" -n 6 --dist=loadgroup --clean --skip-index \
+#       --output=~/.tmp/oog-fixtures
+#
+# or via the detached helper (Osaka with traces; defaults to -n 2, no -m filter):
+#
+#   scripts/fill_for_oog_detection.sh tests/ported_static
+#
+# A single fork is enough (OOG-by-design is fork-invariant in practice).
+# Then detect + mark (default trace dir: ~/.tmp/oog-traces):
+#
+#   just mark_oog_tests
+#   just mark_oog_tests /path/to/traces        # custom trace dir
+[group('oog check')]
+[doc('Detect + mark out-of-gas tests from pre-produced EELS traces')]
+mark_oog_tests evm_dump_dir="~/.tmp/oog-traces" *args:
+    @mkdir -p "{{ output_dir }}/mark-oog-tests"
+    uv run python scripts/detect_oog_by_design.py \
+        --evm-dump-dir="{{ evm_dump_dir }}" \
+        --repo-root="{{ root }}" \
+        {{ args }} \
+        --output="{{ output_dir }}/mark-oog-tests/oog_report.json"
+    uv run python scripts/mark_tests.py \
+        "{{ output_dir }}/mark-oog-tests/oog_report.json"
+
 # Run CI release script integration tests
 [group('unit tests')]
 test-ci-scripts *args:
diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/execute_fill.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/execute_fill.py
index d827d96927a..d14101b8f70 100644
--- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/execute_fill.py
+++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/execute_fill.py
@@ -193,6 +193,12 @@ def pytest_configure(config: pytest.Config) -> None:
         "exception_test: Negative tests that include an invalid block or "
         "transaction.",
     )
+    config.addinivalue_line(
+        "markers",
+        "out_of_gas: test whose execution runs out of gas by design, at the "
+        "top-level transaction or a sub-call (applied by "
+        "scripts/mark_tests.py from scripts/detect_oog_by_design.py).",
+    )
     config.addinivalue_line(
         "markers",
         "eip_checklist(item_id, eip=None): Mark a test as implementing a "
diff --git a/scripts/detect_oog_by_design.py b/scripts/detect_oog_by_design.py
new file mode 100644
index 00000000000..e3c3584fed0
--- /dev/null
+++ b/scripts/detect_oog_by_design.py
@@ -0,0 +1,483 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# dependencies = []
+# ///
+r"""
+Detect EEST tests that are out-of-gas by design.
+
+Scans EIP-3155 traces emitted by EELS (the default t8n in this repo)
+under a `--evm-dump-dir` and reports every transaction step whose
+`error` field is exactly "OutOfGasError" -- the Python exception class
+name written by EELS at
+src/ethereum_spec_tools/evm_tools/t8n/evm_trace/eip3155.py:69-70.
+
+The OOG class is uniformly named `OutOfGasError` across every fork
+(src/ethereum/forks/*/vm/exceptions.py:51) and distinct from all
+other halt classes, so exact equality on that string gives zero
+false positives.
+
+Filtering for "by design" is delegated to `fill`: if a test was
+filled successfully and its trace contains `OutOfGasError`, the
+post-state matched the author's declaration, so the OOG is by
+definition expected. Caveat (by-design under EELS semantics): in
+the rare case where an author only asserts generic post-state
+(e.g. balance unchanged) and an EELS OOG happens to satisfy it
+incidentally, this script will flag the test anyway. Within EELS
+semantics this is the strongest signal obtainable without a
+separate ground-truth source.
+
+The default JSON output is consumable by `scripts/mark_tests.py`:
+it is the wrapped self-describing form
+
+    {"marker": "<name>", "nodeids": ["tests/foo/test_bar.py::test_baz", ...]}
+
+so the two scripts chain directly:
+
+    uv run scripts/detect_oog_by_design.py tests/ported_static \
+        --evm-dump-dir ~/.tmp/oog-traces --output oog_report.json
+    uv run scripts/mark_tests.py oog_report.json
+
+Trace dump dirs are keyed by the *sanitized* test name
+(`ported_static__stRevertTest__revert_prefound_oog__test_...`), which
+is lossy: the module's `test_` prefix is stripped
+(fixtures/collector.py:114-120) and class names are not encoded. Each
+sanitized name is therefore reconstructed into a pytest node ID and
+**verified against the source AST** (the `test_` prefix is re-added and
+the function is located, including any enclosing class) before being
+emitted. Names that cannot be verified are reported as unresolved
+rather than guessed -- and `mark_tests.py` independently re-checks every
+node ID against the filesystem, so a wrong ID can never silently
+mis-mark a test.
+
+This script is stdlib-only on purpose: it does NOT import
+`execution_testing`, so it is unaffected by the latent substring-match
+bug in that package's `_is_out_of_gas_error` helper, which tests for the
+substring "out of gas" and so never matches the EELS class name
+"OutOfGasError".
+"""
+
+import argparse
+import ast
+import json
+import re
+import subprocess
+import sys
+from collections import defaultdict
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import IO, Iterator
+
+OOG_ERROR_NAME = "OutOfGasError"
+TRACE_FILE_RE = re.compile(r"trace-(\d+)-0x[0-9a-fA-F]+\.jsonl")
+DEFAULT_MARKER = "out_of_gas"
+
+
+@dataclass(frozen=True)
+class OogEvent:
+    """Immutable record of one out-of-gas step found in a trace file."""
+
+    test_id: str  # sanitized test name taken from the dump-dir path
+    config: str  # path component directly above the block directory
+    block: int  # block index parsed from the directory name
+    tx_index: int  # transaction index parsed from the trace file name
+    depth: int  # `depth` value of the trace step
+    op_name: str  # `opName` of the trace step ("" when absent)
+    pc: int  # `pc` of the trace step (-1 when absent)
+    trace_file: str  # path of the trace file the step was read from
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Detect EEST tests that are out-of-gas by design "
+            "(top-level or sub-call) by scanning EELS EIP-3155 traces. "
+            "Emit mark_tests.py-compatible node IDs."
+        ),
+    )
+    parser.add_argument(
+        "test_path",
+        nargs="?",
+        default="tests/ported_static",
+        help=(
+            "Test path forwarded to fill with --run-fill; the scan always "
+            "covers the whole --evm-dump-dir. Default: tests/ported_static."
+        ),
+    )
+    parser.add_argument(
+        "--evm-dump-dir",
+        default=str(Path.home() / ".tmp" / "oog-traces"),
+        help=(
+            "Directory containing per-test trace subdirectories. "
+            "Default: ~/.tmp/oog-traces."
+        ),
+    )
+    parser.add_argument(
+        "--run-fill",
+        action="store_true",
+        help=(
+            "Run `fill` with --traces before scanning. Without this "
+            "flag the script only scans an existing --evm-dump-dir."
+        ),
+    )
+    parser.add_argument(
+        "--workers",
+        default="auto",
+        help="Value passed to fill's -n flag. Default: auto.",
+    )
+    parser.add_argument(
+        "--fork",
+        default=None,
+        help=(
+            "Optional fork override forwarded to fill. When unset, "
+            "fill iterates every valid fork per test."
+        ),
+    )
+    parser.add_argument(
+        "--marker",
+        default=DEFAULT_MARKER,
+        help=(
+            "Marker name embedded in the JSON output for mark_tests.py. "
+            f"Default: {DEFAULT_MARKER!r}."
+        ),
+    )
+    parser.add_argument(
+        "--repo-root",
+        default=str(Path.cwd()),
+        help=(
+            "Repository root used to verify node IDs against the source "
+            "(node ID module paths are relative to this). Default: cwd."
+        ),
+    )
+    parser.add_argument(
+        "--output",
+        default="-",
+        help=(
+            "Output destination. `-` (default) prints a text report to "
+            "stdout. A path ending in `.json` writes the mark_tests.py "
+            "wrapped form {marker, nodeids}; any other path writes the "
+            "text report to that file."
+        ),
+    )
+    parser.add_argument(
+        "--events-json",
+        default=None,
+        help=(
+            "Optional path to also write the detailed per-event records "
+            "(test_id, config, block, tx, depth, op, pc, trace_file)."
+        ),
+    )
+    parser.add_argument(
+        "--top-level-only",
+        action="store_true",
+        help="Report only events at depth == 1.",
+    )
+    return parser.parse_args()
+
+
+def run_fill(
+    test_path: str,
+    dump_dir: Path,
+    workers: str,
+    fork: str | None,
+) -> None:
+    """
+    Run ``uv run fill`` on ``test_path`` with tracing switched on.
+
+    Traces go to ``dump_dir`` and ``workers`` is passed as ``-n``;
+    ``--fork`` is appended only when ``fork`` is given. The command is
+    echoed to stderr first, and a non-zero exit raises
+    ``subprocess.CalledProcessError`` because of ``check=True``.
+    """
+    fill_cmd = ["uv", "run", "fill", test_path, "--traces"]
+    fill_cmd += ["--evm-dump-dir", str(dump_dir)]
+    fill_cmd += ["-n", workers]
+    if fork is not None:
+        fill_cmd.extend(["--fork", fork])
+    # Echo the exact argv, shell-trace style, before launching it.
+    print(f"+ {' '.join(fill_cmd)}", file=sys.stderr)
+    subprocess.run(fill_cmd, check=True)
+
+
+def parse_relative_path(
+    path: Path,
+    dump_root: Path,
+) -> tuple[str, str, int, int] | None:
+    """
+    Split a trace-file path below ``dump_root`` into its identifiers.
+
+    The expected layout is ``<test_id...>/<config>/<block>/<trace file>``.
+    Return ``(test_id, config, block_idx, tx_idx)``, or None when the
+    path is outside ``dump_root``, is too shallow, has a non-integer
+    block directory, or a file name TRACE_FILE_RE rejects.
+    """
+    try:
+        parts = path.relative_to(dump_root).parts
+    except ValueError:
+        return None
+    if len(parts) < 4:
+        return None
+    *test_parts, config, block_str, filename = parts
+    name_match = TRACE_FILE_RE.match(filename)
+    if name_match is None:
+        return None
+    try:
+        block_idx = int(block_str)
+    except ValueError:
+        return None
+    tx_idx = int(name_match.group(1))
+    return "/".join(test_parts), config, block_idx, tx_idx
+
+
+def scan_trace_file(
+    path: Path,
+    ids: tuple[str, str, int, int],
+) -> Iterator[OogEvent]:
+    """
+    Yield deduplicated OOG events from a single trace .jsonl file.
+
+    Stream the file line by line. Skip blank lines, lines that are not
+    a JSON object, and lines lacking `depth` -- the trailing FinalTrace
+    summary has no depth (see eip3155.py:54-70) and would otherwise
+    double-count top-level OOG. Deduplicate within a file by
+    (depth, pc, op_name) to drop the tracer's occasional double-stamps.
+    """
+    test_id, config, block, tx_idx = ids
+    seen: set[tuple[int, int, str]] = set()
+    with path.open("r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if not isinstance(obj, dict) or "depth" not in obj:
+                continue
+            if obj.get("error") != OOG_ERROR_NAME:
+                continue
+            key = (
+                int(obj["depth"]),
+                int(obj.get("pc", -1)),
+                str(obj.get("opName", "")),
+            )
+            if key in seen:
+                continue
+            seen.add(key)
+            yield OogEvent(
+                test_id=test_id,
+                config=config,
+                block=block,
+                tx_index=tx_idx,
+                depth=key[0],
+                op_name=key[2],
+                pc=key[1],
+                trace_file=str(path),
+            )
+
+
+def scan(dump_root: Path, top_level_only: bool) -> list[OogEvent]:
+    """
+    Gather OOG events from every ``trace-*.jsonl`` below ``dump_root``.
+
+    Undecodable paths are skipped; ``top_level_only`` keeps depth 1 only.
+    """
+    found: list[OogEvent] = []
+    for trace_path in sorted(dump_root.rglob("trace-*.jsonl")):
+        if (ids := parse_relative_path(trace_path, dump_root)) is not None:
+            hits = scan_trace_file(trace_path, ids)
+            found += [e for e in hits if e.depth == 1 or not top_level_only]
+    return found
+
+
+# ---------------------------------------------------------------------------
+# Node-ID resolution
+# ---------------------------------------------------------------------------
+
+
+def _find_function_qualnames(module_path: Path, funcname: str) -> list[str]:
+    """
+    Return the qualnames of every function named ``funcname`` in a module.
+
+    A top-level ``def funcname`` yields ``"funcname"``; a method inside
+    ``class Foo`` yields ``"Foo::funcname"`` (the ``::`` separator that
+    mark_tests.py's `_find_function` expects). Several results mean the
+    sanitized name is ambiguous; all are returned so the caller can
+    over-mark rather than guess. Unreadable/unparsable modules yield [].
+    """
+    try:
+        tree = ast.parse(module_path.read_bytes())  # bytes: honour PEP 263
+    except (OSError, SyntaxError, ValueError):
+        return []
+    out: list[str] = []
+
+    def walk(body: list[ast.stmt], prefix: list[str]) -> None:
+        for node in body:
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                if node.name == funcname:
+                    out.append("::".join(prefix + [node.name]))
+            elif isinstance(node, ast.ClassDef):
+                walk(node.body, prefix + [node.name])
+
+    walk(tree.body, [])
+    return out
+
+
+def resolve_node_ids(
+    test_id: str,
+    repo_root: Path,
+) -> tuple[list[str], str | None]:
+    """
+    Turn a sanitized test_id into pytest node IDs checked against source.
+
+    ``test_id`` is split on ``__``: the last token is the test function
+    name, the one before it the module stem, and the rest the directory
+    below ``tests/``. The module is tried as ``test_<stem>.py``,
+    ``<stem>.py`` and ``<stem>Filler.py``, in that order; the first file
+    that exists is searched for the function and decides the result.
+
+    Return ``(node_ids, None)`` on success, or ``([], error)`` when the
+    id has too few tokens, no file exists, or the function is absent.
+    """
+    tokens = test_id.split("__")
+    if len(tokens) < 2:
+        return [], f"{test_id}: too few path components"
+    *reldir_parts, stem, funcname = tokens
+    module_names = (
+        f"test_{stem}.py",
+        f"{stem}.py",
+        f"{stem}Filler.py",
+    )
+    for fname in module_names:
+        module_rel = Path("tests", *reldir_parts, fname)
+        abs_path = repo_root / module_rel
+        if not abs_path.is_file():
+            continue
+        # The first candidate that exists settles the outcome.
+        module_str = module_rel.as_posix()
+        qualnames = _find_function_qualnames(abs_path, funcname)
+        if not qualnames:
+            missing = f"found {module_str} but no function {funcname!r}"
+            return [], f"{test_id}: {missing}"
+        return [f"{module_str}::{qn}" for qn in qualnames], None
+    return [], f"{test_id}: no module file for stem {stem!r}"
+
+
+def resolve_all(
+    events: list[OogEvent],
+    repo_root: Path,
+) -> tuple[list[str], list[str]]:
+    """
+    Resolve the distinct test_ids among ``events`` to pytest node IDs.
+
+    Return ``(node_ids, warnings)``: the sorted, de-duplicated node IDs
+    and one message per test_id that could not be verified.
+    """
+    unresolved: list[str] = []
+    resolved_ids: set[str] = set()
+    for test_id in sorted({event.test_id for event in events}):
+        found, problem = resolve_node_ids(test_id, repo_root)
+        resolved_ids |= set(found)
+        if problem:
+            unresolved.append(problem)
+    return sorted(resolved_ids), unresolved
+
+
+# ---------------------------------------------------------------------------
+# Output
+# ---------------------------------------------------------------------------
+
+
+def emit_text(
+    events: list[OogEvent],
+    node_ids: list[str],
+    warnings: list[str],
+    marker: str,
+    stream: IO[str],
+) -> None:
+    """Print per-test OOG events, then a summary block, to ``stream``."""
+
+    def say(text: str) -> None:
+        print(text, file=stream)
+
+    grouped: dict[str, list[OogEvent]] = {}
+    for event in events:
+        grouped.setdefault(event.test_id, []).append(event)
+    for test_id, test_events in sorted(grouped.items()):
+        say(test_id)
+        for e in test_events:
+            say(
+                f"  {e.config}  block={e.block}  tx={e.tx_index}  "
+                f"depth={e.depth}  op={e.op_name}  pc={e.pc}"
+            )
+    total = len(grouped)
+    sub_call = sum(any(e.depth > 1 for e in evs) for evs in grouped.values())
+    say("")
+    say(f"Marker:                    @pytest.mark.{marker}")
+    say(f"Total OOG-by-design tests: {total}")
+    say(f"  Top-level OOG only:      {total - sub_call}")
+    say(f"  Any sub-call OOG:        {sub_call}")
+    say(f"Resolved node IDs:         {len(node_ids)}")
+    if warnings:
+        say(f"Unresolved test IDs:       {len(warnings)}")
+    for w in warnings:
+        say(f"  ? {w}")
+
+
+def emit_mark_tests_json(
+    node_ids: list[str],
+    marker: str,
+    stream: IO[str],
+) -> None:
+    """Serialise ``{"marker": ..., "nodeids": [...]}`` to ``stream``."""
+    payload = {"marker": marker, "nodeids": node_ids}
+    stream.write(json.dumps(payload, indent=2) + "\n")
+
+
+def emit_events_json(events: list[OogEvent], path: Path) -> None:
+    """Dump every event as a JSON array of objects to ``path``."""
+    with path.open("w") as out:
+        records = [asdict(event) for event in events]
+        out.write(json.dumps(records, indent=2) + "\n")
+
+
+def main() -> int:
+    """Run the CLI; return 2 if the trace directory is missing, else 0."""
+    args = parse_args()
+    dump_dir = Path(args.evm_dump_dir).expanduser()
+    repo_root = Path(args.repo_root).expanduser()
+    if args.run_fill:
+        dump_dir.mkdir(parents=True, exist_ok=True)
+        run_fill(args.test_path, dump_dir, args.workers, args.fork)
+    if not dump_dir.exists():
+        print(
+            f"error: --evm-dump-dir {dump_dir} does not exist "
+            "(use --run-fill to generate traces)",
+            file=sys.stderr,
+        )
+        return 2
+
+    events = scan(dump_dir, args.top_level_only)
+    node_ids, warnings = resolve_all(events, repo_root)
+    if args.events_json:
+        emit_events_json(events, Path(args.events_json).expanduser())
+
+    if args.output == "-":
+        emit_text(events, node_ids, warnings, args.marker, sys.stdout)
+        return 0
+    out = Path(args.output).expanduser()
+    with out.open("w") as f:
+        if out.suffix == ".json":
+            emit_mark_tests_json(node_ids, args.marker, f)
+        else:
+            emit_text(events, node_ids, warnings, args.marker, f)
+    # File output: repeat unresolved IDs on stderr so they stay visible
+    # even when the file holds only the machine-readable JSON.
+    for w in warnings:
+        print(f"warning: unresolved {w}", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/fill_for_oog_detection.sh b/scripts/fill_for_oog_detection.sh
new file mode 100755
index 00000000000..0c458d75c7c
--- /dev/null
+++ b/scripts/fill_for_oog_detection.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Run `fill` on tests/ported_static for Osaka with traces enabled, detached
+# from the controlling terminal so it survives logout / harness timeouts.
+#
+# Used as the producer step for `scripts/detect_oog_by_design.py`.
+#
+# Usage:
+#   scripts/fill_for_oog_detection.sh            # default: tests/ported_static, -n 2
+#   scripts/fill_for_oog_detection.sh -n 3       # custom worker count
+#   scripts/fill_for_oog_detection.sh tests/foo  # custom test path
+#
+# All temp files land under $HOME/.tmp (avoids filling tmpfs /tmp). Traces go
+# to $HOME/.tmp/oog-traces; fixtures go to $HOME/.tmp/oog-fixtures.
+#
+# Monitor with:
+#   tail -f $HOME/.tmp/oog-fill.log
+#   grep -oE '\[ *[0-9]+/[0-9]+\]' $HOME/.tmp/oog-fill.log | tail -1
+#
+# When fill exits, run:
+#   uv run scripts/detect_oog_by_design.py <test_path> \
+#       --evm-dump-dir=$HOME/.tmp/oog-traces \
+#       --output=oog-by-design.json
+
+set -euo pipefail
+
+TEST_PATH="tests/ported_static"
+# A leading non-option argument is the test path; the rest is passed to `fill`.
+if [[ $# -gt 0 && "$1" != -* ]]; then TEST_PATH="$1"; shift; fi
+EXTRA_ARGS=("$@")
+if [[ ${#EXTRA_ARGS[@]} -eq 0 ]]; then
+    EXTRA_ARGS=(-n 2)
+fi
+
+DUMP_DIR="$HOME/.tmp/oog-traces"
+FIXTURES_DIR="$HOME/.tmp/oog-fixtures"
+LOG_FILE="$HOME/.tmp/oog-fill.log"
+
+echo "Cleaning previous run state..."
+rm -rf "$DUMP_DIR" "$FIXTURES_DIR" "$LOG_FILE"
+mkdir -p "$DUMP_DIR" "$FIXTURES_DIR"
+
+echo "Starting fill (detached)..."
+# Exec fill directly (no `bash -c` string) so paths/args keep their quoting.
+setsid nohup env TMPDIR="$HOME/.tmp" uv run fill "$TEST_PATH" \
+    --fork Osaka \
+    --traces \
+    --evm-dump-dir="$DUMP_DIR" \
+    --output="$FIXTURES_DIR" \
+    "${EXTRA_ARGS[@]}" \
+    > "$LOG_FILE" 2>&1 < /dev/null &
+PID=$!
+disown
+
+echo "fill pid: $PID"
+echo "log:      $LOG_FILE"
+echo "traces:   $DUMP_DIR"
+echo
+echo "Monitor:"
+echo "  tail -f $LOG_FILE"
+echo "  grep -oE '\\[ *[0-9]+/[0-9]+\\]' $LOG_FILE | tail -1"
+echo "  ps -p $PID"