Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion benchmarks/swtbench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
)
from benchmarks.utils.constants import MODEL_NAME_OR_PATH
from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.patch_utils import remove_files_from_patch
from benchmarks.utils.patch_utils import (
keep_only_test_files,
remove_files_from_patch,
)
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger

Expand Down Expand Up @@ -205,6 +208,13 @@ def convert_to_swtbench_format(input_file: str, output_file: str) -> None:
# postprocess git_patch
setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
git_patch = remove_files_from_patch(git_patch, setup_files)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With keep_only_test_files applied two lines below, this call is now logically redundant — pyproject.toml, tox.ini, and setup.py are not test files and would be stripped by the new filter anyway. Keeping it is fine for clarity, but a short follow-up comment (e.g. # belt-and-suspenders; also covered by keep_only_test_files below) would help future readers understand why two successive filter calls exist here.

# SWT-bench only scores diffs to existing test files. Strip
# everything else (source-code "fix" attempts, scratch files
# like reproduction.py / FIX_SUMMARY.md, build/, docs/, etc.):
# a non-test diff that lands in model_patch can silence the
# F2P signal because the test then runs against the agent's
# own patched code instead of the buggy code.
git_patch = keep_only_test_files(git_patch)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 Important: Missing Runtime Evidence - The PR description shows the problem exists (table with resolution rates), but doesn't show that applying this fix actually improves scores on a real evaluation run.

Please add an Evidence section to the PR description with:

  • The command used to run eval_infer.py on an existing bad run
  • Before/after resolved instance counts from the evaluation output
  • Confirmation that the score improved as expected

This proves the fix works in practice, not just in theory.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment thread
juanmichelini marked this conversation as resolved.

# Create SWT-Bench format entry
swtbench_entry = {
Expand Down
56 changes: 56 additions & 0 deletions benchmarks/utils/patch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,62 @@ def remove_files_from_patch(git_patch, files):
return result


def _is_test_path(path: str) -> bool:
"""Heuristic: does ``path`` look like a project test file?

A path counts as a test if it lives under a ``tests``/``test``/``testing``
directory anywhere in its tree, or if its basename matches the standard
pytest/unittest discovery patterns (``test_*.py``, ``*_test.py``,
``conftest.py``).

Files at the repository root are *not* considered tests even when they
match the naming pattern (e.g. ``test_repro.py``), because in SWT-bench
those are agent-authored scratch files that the harness ignores.
"""
if "/" not in path:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This guard intentionally rejects root-level files (agent scratch files like test_repro.py) — but it also silently drops a root-level conftest.py, which is a legitimate pytest fixture in many projects. If a SWT-bench instance has a root-level conftest.py that the agent correctly patches, those changes will be stripped. Consider adding a test_drops_root_level_conftest test case to document this trade-off explicitly and guard against someone loosening the guard in the future without realising the impact:

def test_drops_root_level_conftest(self):
    # Root conftest.py is treated as agent scratch, not a real test file.
    patch = _diff('conftest.py') + _diff('tests/conftest.py')
    out = keep_only_test_files(patch)
    assert 'conftest.py' not in out.split('\n')[0]  # root dropped
    assert 'tests/conftest.py' in out              # package-level kept

return False
parts = path.split("/")
if any(seg in ("tests", "test", "testing") for seg in parts[:-1]):
Comment thread
juanmichelini marked this conversation as resolved.
return True
base = parts[-1]
return (
(base.startswith("test_") and base.endswith(".py")) or base.endswith("_test.py") or base == "conftest.py"
)


def keep_only_test_files(git_patch: str) -> str:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Suggestion: The diff-block extraction loop (finding start/end bounds per diff --git header) is identical to the one in remove_files_from_patch. Consider extracting a private helper to avoid drift:

def _split_diff_blocks(git_patch: str) -> list[str]:
    """Split a git patch string into individual per-file diff blocks."""
    if not git_patch:
        return []
    matches = list(re.finditer(r"diff --git [^\n]*\n", git_patch))
    if not matches:
        return [git_patch]  # or [] — depends on caller contract
    return [
        git_patch[m.start() : (matches[i + 1].start() if i + 1 < len(matches) else len(git_patch))]
        for i, m in enumerate(matches)
    ]

Both remove_files_from_patch and keep_only_test_files can then reduce to a filter over _split_diff_blocks(git_patch). Not a blocker for this PR, but worth doing before a third caller appears.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same pattern but no need to extract

"""Return ``git_patch`` with every non-test file diff removed.

Useful for benchmarks that only score test-file changes (e.g. SWT-bench).
A file is kept iff :func:`_is_test_path` returns ``True`` for either the
Comment thread
juanmichelini marked this conversation as resolved.
pre- or post-image path in its ``diff --git`` header.

The implementation mirrors :func:`remove_files_from_patch` so the two
helpers behave consistently.
"""
if not git_patch:
return git_patch

diff_pattern = r"diff --git [^\n]*\n"
diff_matches = list(re.finditer(diff_pattern, git_patch))
if not diff_matches:
return git_patch

kept = []
for i, match in enumerate(diff_matches):
start = match.start()
end = (
diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch)
)
diff = git_patch[start:end]
header = diff.split("\n", 1)[0]
m = re.match(r"diff --git a/(.+) b/(.+)", header)
if m and (_is_test_path(m.group(1)) or _is_test_path(m.group(2))):
kept.append(diff)

return "".join(kept)


def remove_binary_diffs(patch_text):
"""
Remove binary file diffs from a git patch.
Expand Down
74 changes: 74 additions & 0 deletions tests/test_patch_utils_keep_test_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Tests for patch_utils.keep_only_test_files."""

from benchmarks.utils.patch_utils import keep_only_test_files


def _diff(path: str, body: str = "@@ -1 +1 @@\n-old\n+new\n") -> str:
"""Build a minimal ``diff --git`` block for ``path``."""
return (
f"diff --git a/{path} b/{path}\n"
f"--- a/{path}\n"
f"+++ b/{path}\n"
f"{body}"
)


class TestKeepOnlyTestFiles:
def test_empty_returns_empty(self):
assert keep_only_test_files("") == ""

def test_passthrough_when_no_diff_header(self):
patch = "not a real patch\n"
assert keep_only_test_files(patch) == patch

def test_keeps_files_under_tests_dir(self):
patch = _diff("tests/foo/test_bar.py") + _diff("sympy/core/basic.py")
out = keep_only_test_files(patch)
assert "tests/foo/test_bar.py" in out
assert "sympy/core/basic.py" not in out

def test_keeps_files_under_testing_dir(self):
patch = _diff("django/testing/helpers.py")
assert "django/testing/helpers.py" in keep_only_test_files(patch)

def test_keeps_conftest(self):
patch = _diff("pkg/conftest.py")
assert "pkg/conftest.py" in keep_only_test_files(patch)

def test_keeps_test_prefix_outside_test_dir(self):
patch = _diff("pkg/sub/test_helpers.py")
assert "pkg/sub/test_helpers.py" in keep_only_test_files(patch)

def test_keeps_underscore_test_suffix(self):
patch = _diff("pkg/sub/helpers_test.py")
assert "pkg/sub/helpers_test.py" in keep_only_test_files(patch)

def test_drops_root_level_scratch_repros(self):
# Agent scratch files at the repo root are not real tests even when
# they match the naming convention.
patch = (
_diff("reproduction.py")
+ _diff("test_repro.py")
+ _diff("FIX_SUMMARY.md")
+ _diff("tests/test_real.py")
)
out = keep_only_test_files(patch)
assert "reproduction.py" not in out
assert "test_repro.py" not in out
assert "FIX_SUMMARY.md" not in out
assert "tests/test_real.py" in out

def test_drops_build_and_docs(self):
patch = (
_diff("build/lib/requests/__init__.py")
+ _diff("docs/intro.rst")
+ _diff("tests/test_real.py")
)
out = keep_only_test_files(patch)
assert "build/lib/requests/__init__.py" not in out
assert "docs/intro.rst" not in out
assert "tests/test_real.py" in out

def test_all_non_test_returns_empty_string(self):
patch = _diff("sympy/core/basic.py") + _diff("django/db/models/base.py")
assert keep_only_test_files(patch) == ""
Loading