diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index acf7a0ea..0cadfdec 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -25,7 +25,10 @@ ) from benchmarks.utils.constants import MODEL_NAME_OR_PATH from benchmarks.utils.laminar import LaminarService -from benchmarks.utils.patch_utils import remove_files_from_patch +from benchmarks.utils.patch_utils import ( + keep_only_test_files, + remove_files_from_patch, +) from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -205,6 +208,13 @@ def convert_to_swtbench_format(input_file: str, output_file: str) -> None: # postprocess git_patch setup_files = ["pyproject.toml", "tox.ini", "setup.py"] git_patch = remove_files_from_patch(git_patch, setup_files) + # SWT-bench only scores diffs to existing test files. Strip + # everything else (source-code "fix" attempts, scratch files + # like reproduction.py / FIX_SUMMARY.md, build/, docs/, etc.): + # a non-test diff that lands in model_patch can silence the + # F2P signal because the test then runs against the agent's + # own patched code instead of the buggy code. + git_patch = keep_only_test_files(git_patch) # Create SWT-Bench format entry swtbench_entry = { diff --git a/benchmarks/utils/patch_utils.py b/benchmarks/utils/patch_utils.py index aa8afc01..9ccbee7a 100644 --- a/benchmarks/utils/patch_utils.py +++ b/benchmarks/utils/patch_utils.py @@ -69,6 +69,62 @@ def remove_files_from_patch(git_patch, files): return result +def _is_test_path(path: str) -> bool: + """Heuristic: does ``path`` look like a project test file? + + A path counts as a test if it lives under a ``tests``/``test``/``testing`` + directory anywhere in its tree, or if its basename matches the standard + pytest/unittest discovery patterns (``test_*.py``, ``*_test.py``, + ``conftest.py``). + + Files at the repository root are *not* considered tests even when they + match the naming pattern (e.g. ``test_repro.py``), because in SWT-bench + those are agent-authored scratch files that the harness ignores. + """ + if "/" not in path: + return False + parts = path.split("/") + if any(seg in ("tests", "test", "testing") for seg in parts[:-1]): + return True + base = parts[-1] + return ( + (base.startswith("test_") and base.endswith(".py")) or base.endswith("_test.py") or base == "conftest.py" + ) + + +def keep_only_test_files(git_patch: str) -> str: + """Return ``git_patch`` with every non-test file diff removed. + + Useful for benchmarks that only score test-file changes (e.g. SWT-bench). + A file is kept iff :func:`_is_test_path` returns ``True`` for either the + pre- or post-image path in its ``diff --git`` header. + + The implementation mirrors :func:`remove_files_from_patch` so the two + helpers behave consistently. + """ + if not git_patch: + return git_patch + + diff_pattern = r"diff --git [^\n]*\n" + diff_matches = list(re.finditer(diff_pattern, git_patch)) + if not diff_matches: + return git_patch + + kept = [] + for i, match in enumerate(diff_matches): + start = match.start() + end = ( + diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch) + ) + diff = git_patch[start:end] + header = diff.split("\n", 1)[0] + m = re.match(r"diff --git a/(.+) b/(.+)", header) + if m and (_is_test_path(m.group(1)) or _is_test_path(m.group(2))): + kept.append(diff) + + return "".join(kept) + + def remove_binary_diffs(patch_text): """ Remove binary file diffs from a git patch. diff --git a/tests/test_patch_utils_keep_test_files.py b/tests/test_patch_utils_keep_test_files.py new file mode 100644 index 00000000..c78eeba9 --- /dev/null +++ b/tests/test_patch_utils_keep_test_files.py @@ -0,0 +1,74 @@ +"""Tests for patch_utils.keep_only_test_files.""" + +from benchmarks.utils.patch_utils import keep_only_test_files + + +def _diff(path: str, body: str = "@@ -1 +1 @@\n-old\n+new\n") -> str: + """Build a minimal ``diff --git`` block for ``path``.""" + return ( + f"diff --git a/{path} b/{path}\n" + f"--- a/{path}\n" + f"+++ b/{path}\n" + f"{body}" + ) + + +class TestKeepOnlyTestFiles: + def test_empty_returns_empty(self): + assert keep_only_test_files("") == "" + + def test_passthrough_when_no_diff_header(self): + patch = "not a real patch\n" + assert keep_only_test_files(patch) == patch + + def test_keeps_files_under_tests_dir(self): + patch = _diff("tests/foo/test_bar.py") + _diff("sympy/core/basic.py") + out = keep_only_test_files(patch) + assert "tests/foo/test_bar.py" in out + assert "sympy/core/basic.py" not in out + + def test_keeps_files_under_testing_dir(self): + patch = _diff("django/testing/helpers.py") + assert "django/testing/helpers.py" in keep_only_test_files(patch) + + def test_keeps_conftest(self): + patch = _diff("pkg/conftest.py") + assert "pkg/conftest.py" in keep_only_test_files(patch) + + def test_keeps_test_prefix_outside_test_dir(self): + patch = _diff("pkg/sub/test_helpers.py") + assert "pkg/sub/test_helpers.py" in keep_only_test_files(patch) + + def test_keeps_underscore_test_suffix(self): + patch = _diff("pkg/sub/helpers_test.py") + assert "pkg/sub/helpers_test.py" in keep_only_test_files(patch) + + def test_drops_root_level_scratch_repros(self): + # Agent scratch files at the repo root are not real tests even when + # they match the naming convention. + patch = ( + _diff("reproduction.py") + + _diff("test_repro.py") + + _diff("FIX_SUMMARY.md") + + _diff("tests/test_real.py") + ) + out = keep_only_test_files(patch) + assert "reproduction.py" not in out + assert "test_repro.py" not in out + assert "FIX_SUMMARY.md" not in out + assert "tests/test_real.py" in out + + def test_drops_build_and_docs(self): + patch = ( + _diff("build/lib/requests/__init__.py") + + _diff("docs/intro.rst") + + _diff("tests/test_real.py") + ) + out = keep_only_test_files(patch) + assert "build/lib/requests/__init__.py" not in out + assert "docs/intro.rst" not in out + assert "tests/test_real.py" in out + + def test_all_non_test_returns_empty_string(self): + patch = _diff("sympy/core/basic.py") + _diff("django/db/models/base.py") + assert keep_only_test_files(patch) == ""