From 4c25a6b5255ba2b1332ab68cbaf7a1c75173942d Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 29 May 2026 14:15:59 +0000 Subject: [PATCH] fix(swebenchmultimodal): skip evaluation when predictions file is empty When every inference attempt fails (e.g. LLM 5xx errors) the OpenHands output.jsonl is empty, so the converted predictions file is also empty. The SWE-Bench harness then prints "No instances to run." and exits 0 without calling make_run_report, so the per-model report.json is never created. The previous code unconditionally looked for that report after the harness exited and raised: FileNotFoundError: Expected report file not found: ....json. SWE-Bench harness output naming may have changed. That message hides the real failure (all inference attempts errored) and falsely suggests an upstream harness regression. Detect the empty-predictions case up front, log a clear warning, and return None. main() already handles report_path=None by emitting {"report_json": ""} and exiting cleanly. Refs OpenHands/software-agent-sdk#3435 Co-authored-by: openhands --- benchmarks/swebenchmultimodal/eval_infer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 737213680..8fded1199 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -272,6 +272,24 @@ def run_swebench_multimodal_evaluation( # Default for run_id if not provided run_id = run_id or predictions_path.stem + # If the predictions file has no entries (e.g. every inference attempt + # failed and produced no patches), the SWE-Bench harness prints + # "No instances to run." and exits successfully without writing a + # report file. Detect this up-front and short-circuit so we surface a + # clear log message instead of a misleading + # "SWE-Bench harness output naming may have changed" FileNotFoundError. + num_predictions = sum( + 1 for line in predictions_path.read_text().splitlines() if line.strip() + ) + if num_predictions == 0: + logger.warning( + f"No predictions found in {predictions_file}; " + "skipping SWE-Bench Multimodal evaluation. " + "This usually means every inference attempt failed " + "(e.g. LLM errors) and no patches were produced." + ) + return None + # The key difference from regular SWE-Bench is the --modal true flag cmd = [ "uv",