OpenHands · juanmichelini · May 13, 2026 · May 30, 2026 · all-hands-bot · May 30, 2026
diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py
@@ -25,7 +25,10 @@
 )
 from benchmarks.utils.constants import MODEL_NAME_OR_PATH
 from benchmarks.utils.laminar import LaminarService
-from benchmarks.utils.patch_utils import remove_files_from_patch
+from benchmarks.utils.patch_utils import (
+    keep_only_test_files,
+    remove_files_from_patch,
+)
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import get_logger
 
@@ -205,6 +208,13 @@ def convert_to_swtbench_format(input_file: str, output_file: str) -> None:
                 # postprocess git_patch
                 setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
                 git_patch = remove_files_from_patch(git_patch, setup_files)
+                # SWT-bench only scores diffs to existing test files. Strip
+                # everything else (source-code "fix" attempts, scratch files
+                # like reproduction.py / FIX_SUMMARY.md, build/, docs/, etc.):
+                # a non-test diff that lands in model_patch can silence the
+                # F2P signal because the test then runs against the agent's
+                # own patched code instead of the buggy code.
+                git_patch = keep_only_test_files(git_patch)
 
                 # Create SWT-Bench format entry
                 swtbench_entry = {

diff --git a/benchmarks/utils/patch_utils.py b/benchmarks/utils/patch_utils.py
@@ -69,6 +69,62 @@ def remove_files_from_patch(git_patch, files):
     return result
 
 
+def _is_test_path(path: str) -> bool:
+    """Heuristic: does ``path`` look like a project test file?
+
+    A path counts as a test if it lives under a ``tests``/``test``/``testing``
+    directory anywhere in its tree, or if its basename matches the standard
+    pytest/unittest discovery patterns (``test_*.py``, ``*_test.py``,
+    ``conftest.py``).
+
+    Files at the repository root are *not* considered tests even when they
+    match the naming pattern (e.g. ``test_repro.py``), because in SWT-bench
+    those are agent-authored scratch files that the harness ignores.
+    """
+    if "/" not in path:
+        return False
+    parts = path.split("/")
+    if any(seg in ("tests", "test", "testing") for seg in parts[:-1]):
+        return True
+    base = parts[-1]
+    return (
+        (base.startswith("test_") and base.endswith(".py")) or base.endswith("_test.py") or base == "conftest.py"
+    )
+
+
+def keep_only_test_files(git_patch: str) -> str:
+    """Return ``git_patch`` with every non-test file diff removed.
+
+    Useful for benchmarks that only score test-file changes (e.g. SWT-bench).
+    A file is kept iff :func:`_is_test_path` returns ``True`` for either the
+    pre- or post-image path in its ``diff --git`` header.
+
+    The implementation mirrors :func:`remove_files_from_patch` so the two
+    helpers behave consistently.
+    """
+    if not git_patch:
+        return git_patch
+
+    diff_pattern = r"diff --git [^\n]*\n"
+    diff_matches = list(re.finditer(diff_pattern, git_patch))
+    if not diff_matches:
+        return git_patch
+
+    kept = []
+    for i, match in enumerate(diff_matches):
+        start = match.start()
+        end = (
+            diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch)
+        )
+        diff = git_patch[start:end]
+        header = diff.split("\n", 1)[0]
+        m = re.match(r"diff --git a/(.+) b/(.+)", header)
+        if m and (_is_test_path(m.group(1)) or _is_test_path(m.group(2))):
+            kept.append(diff)
+
+    return "".join(kept)
+
+
 def remove_binary_diffs(patch_text):
     """
     Remove binary file diffs from a git patch.

diff --git a/tests/test_patch_utils_keep_test_files.py b/tests/test_patch_utils_keep_test_files.py
@@ -0,0 +1,74 @@
+"""Tests for patch_utils.keep_only_test_files."""
+
+from benchmarks.utils.patch_utils import keep_only_test_files
+
+
+def _diff(path: str, body: str = "@@ -1 +1 @@\n-old\n+new\n") -> str:
+    """Build a minimal ``diff --git`` block for ``path``."""
+    return (
+        f"diff --git a/{path} b/{path}\n"
+        f"--- a/{path}\n"
+        f"+++ b/{path}\n"
+        f"{body}"
+    )
+
+
+class TestKeepOnlyTestFiles:
+    def test_empty_returns_empty(self):
+        assert keep_only_test_files("") == ""
+
+    def test_passthrough_when_no_diff_header(self):
+        patch = "not a real patch\n"
+        assert keep_only_test_files(patch) == patch
+
+    def test_keeps_files_under_tests_dir(self):
+        patch = _diff("tests/foo/test_bar.py") + _diff("sympy/core/basic.py")
+        out = keep_only_test_files(patch)
+        assert "tests/foo/test_bar.py" in out
+        assert "sympy/core/basic.py" not in out
+
+    def test_keeps_files_under_testing_dir(self):
+        patch = _diff("django/testing/helpers.py")
+        assert "django/testing/helpers.py" in keep_only_test_files(patch)
+
+    def test_keeps_conftest(self):
+        patch = _diff("pkg/conftest.py")
+        assert "pkg/conftest.py" in keep_only_test_files(patch)
+
+    def test_keeps_test_prefix_outside_test_dir(self):
+        patch = _diff("pkg/sub/test_helpers.py")
+        assert "pkg/sub/test_helpers.py" in keep_only_test_files(patch)
+
+    def test_keeps_underscore_test_suffix(self):
+        patch = _diff("pkg/sub/helpers_test.py")
+        assert "pkg/sub/helpers_test.py" in keep_only_test_files(patch)
+
+    def test_drops_root_level_scratch_repros(self):
+        # Agent scratch files at the repo root are not real tests even when
+        # they match the naming convention.
+        patch = (
+            _diff("reproduction.py")
+            + _diff("test_repro.py")
+            + _diff("FIX_SUMMARY.md")
+            + _diff("tests/test_real.py")
+        )
+        out = keep_only_test_files(patch)
+        assert "reproduction.py" not in out
+        assert "test_repro.py" not in out
+        assert "FIX_SUMMARY.md" not in out
+        assert "tests/test_real.py" in out
+
+    def test_drops_build_and_docs(self):
+        patch = (
+            _diff("build/lib/requests/__init__.py")
+            + _diff("docs/intro.rst")
+            + _diff("tests/test_real.py")
+        )
+        out = keep_only_test_files(patch)
+        assert "build/lib/requests/__init__.py" not in out
+        assert "docs/intro.rst" not in out
+        assert "tests/test_real.py" in out
+
+    def test_all_non_test_returns_empty_string(self):
+        patch = _diff("sympy/core/basic.py") + _diff("django/db/models/base.py")
+        assert keep_only_test_files(patch) == ""