-
Notifications
You must be signed in to change notification settings - Fork 65
DRAFT: swtbench: strip non-test files from model_patch in post-processing (option 2 of #708) #711
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,7 +25,10 @@ | |
| ) | ||
| from benchmarks.utils.constants import MODEL_NAME_OR_PATH | ||
| from benchmarks.utils.laminar import LaminarService | ||
| from benchmarks.utils.patch_utils import remove_files_from_patch | ||
| from benchmarks.utils.patch_utils import ( | ||
| keep_only_test_files, | ||
| remove_files_from_patch, | ||
| ) | ||
| from benchmarks.utils.report_costs import generate_cost_report | ||
| from openhands.sdk import get_logger | ||
|
|
||
|
|
@@ -205,6 +208,13 @@ def convert_to_swtbench_format(input_file: str, output_file: str) -> None: | |
| # postprocess git_patch | ||
| setup_files = ["pyproject.toml", "tox.ini", "setup.py"] | ||
| git_patch = remove_files_from_patch(git_patch, setup_files) | ||
| # SWT-bench only scores diffs to existing test files. Strip | ||
| # everything else (source-code "fix" attempts, scratch files | ||
| # like reproduction.py / FIX_SUMMARY.md, build/, docs/, etc.): | ||
| # a non-test diff that lands in model_patch can silence the | ||
| # F2P signal because the test then runs against the agent's | ||
| # own patched code instead of the buggy code. | ||
| git_patch = keep_only_test_files(git_patch) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟠 Important: Missing Runtime Evidence - The PR description shows the problem exists (table with resolution rates), but doesn't show that applying this fix actually improves scores on a real evaluation run. Please add an
This proves the fix works in practice, not just in theory.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
juanmichelini marked this conversation as resolved.
|
||
|
|
||
| # Create SWT-Bench format entry | ||
| swtbench_entry = { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,6 +69,62 @@ def remove_files_from_patch(git_patch, files): | |
| return result | ||
|
|
||
|
|
||
| def _is_test_path(path: str) -> bool: | ||
| """Heuristic: does ``path`` look like a project test file? | ||
|
|
||
| A path counts as a test if it lives under a ``tests``/``test``/``testing`` | ||
| directory anywhere in its tree, or if its basename matches the standard | ||
| pytest/unittest discovery patterns (``test_*.py``, ``*_test.py``, | ||
| ``conftest.py``). | ||
|
|
||
| Files at the repository root are *not* considered tests even when they | ||
| match the naming pattern (e.g. ``test_repro.py``), because in SWT-bench | ||
| those are agent-authored scratch files that the harness ignores. | ||
| """ | ||
| if "/" not in path: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This guard intentionally rejects root-level files (agent scratch files like def test_drops_root_level_conftest(self):
# Root conftest.py is treated as agent scratch, not a real test file.
patch = _diff('conftest.py') + _diff('tests/conftest.py')
out = keep_only_test_files(patch)
assert 'conftest.py' not in out.split('\n')[0] # root dropped
assert 'tests/conftest.py' in out # package-level kept |
||
| return False | ||
| parts = path.split("/") | ||
| if any(seg in ("tests", "test", "testing") for seg in parts[:-1]): | ||
|
juanmichelini marked this conversation as resolved.
|
||
| return True | ||
| base = parts[-1] | ||
| return ( | ||
| (base.startswith("test_") and base.endswith(".py")) or base.endswith("_test.py") or base == "conftest.py" | ||
| ) | ||
|
|
||
|
|
||
| def keep_only_test_files(git_patch: str) -> str: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Suggestion: The diff-block extraction loop (finding def _split_diff_blocks(git_patch: str) -> list[str]:
"""Split a git patch string into individual per-file diff blocks."""
if not git_patch:
return []
matches = list(re.finditer(r"diff --git [^\n]*\n", git_patch))
if not matches:
return [git_patch] # or [] — depends on caller contract
return [
git_patch[m.start() : (matches[i + 1].start() if i + 1 < len(matches) else len(git_patch))]
for i, m in enumerate(matches)
]Both
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same pattern but no need to extract |
||
| """Return ``git_patch`` with every non-test file diff removed. | ||
|
|
||
| Useful for benchmarks that only score test-file changes (e.g. SWT-bench). | ||
| A file is kept iff :func:`_is_test_path` returns ``True`` for either the | ||
|
juanmichelini marked this conversation as resolved.
|
||
| pre- or post-image path in its ``diff --git`` header. | ||
|
|
||
| The implementation mirrors :func:`remove_files_from_patch` so the two | ||
| helpers behave consistently. | ||
| """ | ||
| if not git_patch: | ||
| return git_patch | ||
|
|
||
| diff_pattern = r"diff --git [^\n]*\n" | ||
| diff_matches = list(re.finditer(diff_pattern, git_patch)) | ||
| if not diff_matches: | ||
| return git_patch | ||
|
|
||
| kept = [] | ||
| for i, match in enumerate(diff_matches): | ||
| start = match.start() | ||
| end = ( | ||
| diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch) | ||
| ) | ||
| diff = git_patch[start:end] | ||
| header = diff.split("\n", 1)[0] | ||
| m = re.match(r"diff --git a/(.+) b/(.+)", header) | ||
| if m and (_is_test_path(m.group(1)) or _is_test_path(m.group(2))): | ||
| kept.append(diff) | ||
|
|
||
| return "".join(kept) | ||
|
|
||
|
|
||
| def remove_binary_diffs(patch_text): | ||
| """ | ||
| Remove binary file diffs from a git patch. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| """Tests for patch_utils.keep_only_test_files.""" | ||
|
|
||
| from benchmarks.utils.patch_utils import keep_only_test_files | ||
|
|
||
|
|
||
| def _diff(path: str, body: str = "@@ -1 +1 @@\n-old\n+new\n") -> str: | ||
| """Build a minimal ``diff --git`` block for ``path``.""" | ||
| return ( | ||
| f"diff --git a/{path} b/{path}\n" | ||
| f"--- a/{path}\n" | ||
| f"+++ b/{path}\n" | ||
| f"{body}" | ||
| ) | ||
|
|
||
|
|
||
| class TestKeepOnlyTestFiles: | ||
| def test_empty_returns_empty(self): | ||
| assert keep_only_test_files("") == "" | ||
|
|
||
| def test_passthrough_when_no_diff_header(self): | ||
| patch = "not a real patch\n" | ||
| assert keep_only_test_files(patch) == patch | ||
|
|
||
| def test_keeps_files_under_tests_dir(self): | ||
| patch = _diff("tests/foo/test_bar.py") + _diff("sympy/core/basic.py") | ||
| out = keep_only_test_files(patch) | ||
| assert "tests/foo/test_bar.py" in out | ||
| assert "sympy/core/basic.py" not in out | ||
|
|
||
| def test_keeps_files_under_testing_dir(self): | ||
| patch = _diff("django/testing/helpers.py") | ||
| assert "django/testing/helpers.py" in keep_only_test_files(patch) | ||
|
|
||
| def test_keeps_conftest(self): | ||
| patch = _diff("pkg/conftest.py") | ||
| assert "pkg/conftest.py" in keep_only_test_files(patch) | ||
|
|
||
| def test_keeps_test_prefix_outside_test_dir(self): | ||
| patch = _diff("pkg/sub/test_helpers.py") | ||
| assert "pkg/sub/test_helpers.py" in keep_only_test_files(patch) | ||
|
|
||
| def test_keeps_underscore_test_suffix(self): | ||
| patch = _diff("pkg/sub/helpers_test.py") | ||
| assert "pkg/sub/helpers_test.py" in keep_only_test_files(patch) | ||
|
|
||
| def test_drops_root_level_scratch_repros(self): | ||
| # Agent scratch files at the repo root are not real tests even when | ||
| # they match the naming convention. | ||
| patch = ( | ||
| _diff("reproduction.py") | ||
| + _diff("test_repro.py") | ||
| + _diff("FIX_SUMMARY.md") | ||
| + _diff("tests/test_real.py") | ||
| ) | ||
| out = keep_only_test_files(patch) | ||
| assert "reproduction.py" not in out | ||
| assert "test_repro.py" not in out | ||
| assert "FIX_SUMMARY.md" not in out | ||
| assert "tests/test_real.py" in out | ||
|
|
||
| def test_drops_build_and_docs(self): | ||
| patch = ( | ||
| _diff("build/lib/requests/__init__.py") | ||
| + _diff("docs/intro.rst") | ||
| + _diff("tests/test_real.py") | ||
| ) | ||
| out = keep_only_test_files(patch) | ||
| assert "build/lib/requests/__init__.py" not in out | ||
| assert "docs/intro.rst" not in out | ||
| assert "tests/test_real.py" in out | ||
|
|
||
| def test_all_non_test_returns_empty_string(self): | ||
| patch = _diff("sympy/core/basic.py") + _diff("django/db/models/base.py") | ||
| assert keep_only_test_files(patch) == "" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With
keep_only_test_filesapplied two lines below, this call is now logically redundant —pyproject.toml,tox.ini, andsetup.pyare not test files and would be stripped by the new filter anyway. Keeping it is fine for clarity, but a short follow-up comment (e.g.# belt-and-suspenders; also covered by keep_only_test_files below) would help future readers understand why two successive filter calls exist here.