From 31aa230b5eca3250e1d2d11250a47932451bffd8 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 13 May 2026 02:12:04 +0000
Subject: [PATCH 1/2] swtbench: strip non-test files from model_patch in
 post-processing

SWT-bench only scores diffs against existing test files. Today's
post-processing in eval_infer.py removes a small blacklist of setup
files (pyproject.toml, tox.ini, setup.py) but leaves everything else
alone. That means any source-code 'fix' the agent committed alongside
its test, plus scratch artifacts like reproduction.py, FIX_SUMMARY.md,
build/lib/*, and docs/ changes, all flow through into model_patch.

Source-code diffs are particularly harmful: when SWT-bench applies
model_patch and runs the new test, the test is now executing against
the agent's own 'fixed' code rather than the buggy code, so the F2P
signal is silenced and the instance is marked unresolved.

Empirically (run litellm_proxy-openrouter-qwen-qwen3-coder-next/24103435463,
424 instances):

  Only test files in patch          : 22  instances, 10 resolved (45.5%)
  Mixed test + source code in patch : 332 instances,  4 resolved ( 1.2%)
  Only source code in patch         : 64  instances,  0 resolved ( 0.0%)
  Empty patch                       :  3  instances,  0 resolved ( 0.0%)
  Total                             : 424 instances, 14 resolved ( 3.3%)

This adds keep_only_test_files() in benchmarks/utils/patch_utils.py
that mirrors remove_files_from_patch() and applies the inverse
predicate: keep diffs whose target file lives under tests/, test/,
testing/, or whose basename matches test_*.py / *_test.py /
conftest.py. Repo-root paths are intentionally rejected so the agent's
own root-level scratch test_repro.py files are dropped too.

eval_infer.py applies this immediately after the existing setup-file
strip, so all existing SWT-bench runs can be rescored from
output.jsonl without re-running inference. See OpenHands/benchmarks#708.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swtbench/eval_infer.py         | 12 +++-
 benchmarks/utils/patch_utils.py           | 56 +++++++++++++++++
 tests/test_patch_utils_keep_test_files.py | 74 +++++++++++++++++++++++
 3 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_patch_utils_keep_test_files.py

diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py
index acf7a0eab..0cadfdecd 100644
--- a/benchmarks/swtbench/eval_infer.py
+++ b/benchmarks/swtbench/eval_infer.py
@@ -25,7 +25,10 @@
 )
 from benchmarks.utils.constants import MODEL_NAME_OR_PATH
 from benchmarks.utils.laminar import LaminarService
-from benchmarks.utils.patch_utils import remove_files_from_patch
+from benchmarks.utils.patch_utils import (
+    keep_only_test_files,
+    remove_files_from_patch,
+)
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import get_logger
 
@@ -205,6 +208,13 @@ def convert_to_swtbench_format(input_file: str, output_file: str) -> None:
                 # postprocess git_patch
                 setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
                 git_patch = remove_files_from_patch(git_patch, setup_files)
+                # SWT-bench only scores diffs to existing test files. Strip
+                # everything else (source-code "fix" attempts, scratch files
+                # like reproduction.py / FIX_SUMMARY.md, build/, docs/, etc.):
+                # a non-test diff that lands in model_patch can silence the
+                # F2P signal because the test then runs against the agent's
+                # own patched code instead of the buggy code.
+                git_patch = keep_only_test_files(git_patch)
 
                 # Create SWT-Bench format entry
                 swtbench_entry = {
diff --git a/benchmarks/utils/patch_utils.py b/benchmarks/utils/patch_utils.py
index aa8afc011..954098142 100644
--- a/benchmarks/utils/patch_utils.py
+++ b/benchmarks/utils/patch_utils.py
@@ -69,6 +69,62 @@ def remove_files_from_patch(git_patch, files):
     return result
 
 
+def _is_test_path(path: str) -> bool:
+    """Heuristic: does ``path`` look like a project test file?
+
+    A path counts as a test if it lives under a ``tests``/``test``/``testing``
+    directory anywhere in its tree, or if its basename matches the standard
+    pytest/unittest discovery patterns (``test_*.py``, ``*_test.py``,
+    ``conftest.py``).
+
+    Files at the repository root are *not* considered tests even when they
+    match the naming pattern (e.g. ``test_repro.py``), because in SWT-bench
+    those are agent-authored scratch files that the harness ignores.
+    """
+    if "/" not in path:
+        return False
+    parts = path.split("/")
+    if any(seg in ("tests", "test", "testing") for seg in parts[:-1]):
+        return True
+    base = parts[-1]
+    return (
+        base.startswith("test_") or base.endswith("_test.py") or base == "conftest.py"
+    )
+
+
+def keep_only_test_files(git_patch: str) -> str:
+    """Return ``git_patch`` with every non-test file diff removed.
+
+    Useful for benchmarks that only score test-file changes (e.g. SWT-bench).
+    A file is kept iff :func:`_is_test_path` returns ``True`` for either the
+    pre- or post-image path in its ``diff --git`` header.
+
+    The implementation mirrors :func:`remove_files_from_patch` so the two
+    helpers behave consistently.
+    """
+    if not git_patch:
+        return git_patch
+
+    diff_pattern = r"diff --git [^\n]*\n"
+    diff_matches = list(re.finditer(diff_pattern, git_patch))
+    if not diff_matches:
+        return git_patch
+
+    kept = []
+    for i, match in enumerate(diff_matches):
+        start = match.start()
+        end = (
+            diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch)
+        )
+        diff = git_patch[start:end]
+        header = diff.split("\n", 1)[0]
+        m = re.match(r"diff --git a/(.+) b/(.+)", header)
+        if m and (_is_test_path(m.group(1)) or _is_test_path(m.group(2))):
+            kept.append(diff)
+
+    return "".join(kept)
+
+
 def remove_binary_diffs(patch_text):
     """
     Remove binary file diffs from a git patch.
diff --git a/tests/test_patch_utils_keep_test_files.py b/tests/test_patch_utils_keep_test_files.py
new file mode 100644
index 000000000..c78eeba91
--- /dev/null
+++ b/tests/test_patch_utils_keep_test_files.py
@@ -0,0 +1,74 @@
+"""Tests for patch_utils.keep_only_test_files."""
+
+from benchmarks.utils.patch_utils import keep_only_test_files
+
+
+def _diff(path: str, body: str = "@@ -1 +1 @@\n-old\n+new\n") -> str:
+    """Build a minimal ``diff --git`` block for ``path``."""
+    return (
+        f"diff --git a/{path} b/{path}\n"
+        f"--- a/{path}\n"
+        f"+++ b/{path}\n"
+        f"{body}"
+    )
+
+
+class TestKeepOnlyTestFiles:
+    def test_empty_returns_empty(self):
+        assert keep_only_test_files("") == ""
+
+    def test_passthrough_when_no_diff_header(self):
+        patch = "not a real patch\n"
+        assert keep_only_test_files(patch) == patch
+
+    def test_keeps_files_under_tests_dir(self):
+        patch = _diff("tests/foo/test_bar.py") + _diff("sympy/core/basic.py")
+        out = keep_only_test_files(patch)
+        assert "tests/foo/test_bar.py" in out
+        assert "sympy/core/basic.py" not in out
+
+    def test_keeps_files_under_testing_dir(self):
+        patch = _diff("django/testing/helpers.py")
+        assert "django/testing/helpers.py" in keep_only_test_files(patch)
+
+    def test_keeps_conftest(self):
+        patch = _diff("pkg/conftest.py")
+        assert "pkg/conftest.py" in keep_only_test_files(patch)
+
+    def test_keeps_test_prefix_outside_test_dir(self):
+        patch = _diff("pkg/sub/test_helpers.py")
+        assert "pkg/sub/test_helpers.py" in keep_only_test_files(patch)
+
+    def test_keeps_underscore_test_suffix(self):
+        patch = _diff("pkg/sub/helpers_test.py")
+        assert "pkg/sub/helpers_test.py" in keep_only_test_files(patch)
+
+    def test_drops_root_level_scratch_repros(self):
+        # Agent scratch files at the repo root are not real tests even when
+        # they match the naming convention.
+        patch = (
+            _diff("reproduction.py")
+            + _diff("test_repro.py")
+            + _diff("FIX_SUMMARY.md")
+            + _diff("tests/test_real.py")
+        )
+        out = keep_only_test_files(patch)
+        assert "reproduction.py" not in out
+        assert "test_repro.py" not in out
+        assert "FIX_SUMMARY.md" not in out
+        assert "tests/test_real.py" in out
+
+    def test_drops_build_and_docs(self):
+        patch = (
+            _diff("build/lib/requests/__init__.py")
+            + _diff("docs/intro.rst")
+            + _diff("tests/test_real.py")
+        )
+        out = keep_only_test_files(patch)
+        assert "build/lib/requests/__init__.py" not in out
+        assert "docs/intro.rst" not in out
+        assert "tests/test_real.py" in out
+
+    def test_all_non_test_returns_empty_string(self):
+        patch = _diff("sympy/core/basic.py") + _diff("django/db/models/base.py")
+        assert keep_only_test_files(patch) == ""

From 2acf0946b54f37ec16986df2bc2e01ac647e7a68 Mon Sep 17 00:00:00 2001
From: Juan Michelini <juan@juan.com.uy>
Date: Sat, 30 May 2026 12:32:45 -0300
Subject: [PATCH 2/2] Update benchmarks/utils/patch_utils.py

Co-authored-by: OpenHands Bot <contact@all-hands.dev>
---
 benchmarks/utils/patch_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/utils/patch_utils.py b/benchmarks/utils/patch_utils.py
index 954098142..9ccbee7ab 100644
--- a/benchmarks/utils/patch_utils.py
+++ b/benchmarks/utils/patch_utils.py
@@ -88,7 +88,7 @@ def _is_test_path(path: str) -> bool:
         return True
     base = parts[-1]
     return (
-        base.startswith("test_") or base.endswith("_test.py") or base == "conftest.py"
+        (base.startswith("test_") and base.endswith(".py")) or base.endswith("_test.py") or base == "conftest.py"
     )