From 4c25a6b5255ba2b1332ab68cbaf7a1c75173942d Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 29 May 2026 14:15:59 +0000
Subject: [PATCH] fix(swebenchmultimodal): skip evaluation when predictions
 file is empty

When every inference attempt fails (e.g. LLM 5xx errors), the OpenHands
output.jsonl is empty, so the converted predictions file is also empty.
The SWE-Bench harness then prints "No instances to run." and exits 0
without calling make_run_report, so the per-model report.json is never
created.

The previous code unconditionally looked for that report after the
harness exited and raised:

    FileNotFoundError: Expected report file not found: ....json.
    SWE-Bench harness output naming may have changed.

That message hides the real failure (all inference attempts errored)
and falsely suggests an upstream harness regression.

Detect the empty-predictions case up front, log a clear warning, and
return None. main() already handles report_path=None by emitting
{"report_json": ""} and exiting cleanly.

Refs OpenHands/software-agent-sdk#3435

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swebenchmultimodal/eval_infer.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py
index 737213680..8fded1199 100644
--- a/benchmarks/swebenchmultimodal/eval_infer.py
+++ b/benchmarks/swebenchmultimodal/eval_infer.py
@@ -272,6 +272,24 @@ def run_swebench_multimodal_evaluation(
     # Default for run_id if not provided
     run_id = run_id or predictions_path.stem
 
+    # If the predictions file has no entries (e.g. every inference attempt
+    # failed and produced no patches), the SWE-Bench harness prints
+    # "No instances to run." and exits successfully without writing a
+    # report file. Detect this up-front and short-circuit so we surface a
+    # clear log message instead of a misleading
+    # "SWE-Bench harness output naming may have changed" FileNotFoundError.
+    num_predictions = sum(
+        1 for line in predictions_path.read_text().splitlines() if line.strip()
+    )
+    if num_predictions == 0:
+        logger.warning(
+            f"No predictions found in {predictions_file}; "
+            "skipping SWE-Bench Multimodal evaluation. "
+            "This usually means every inference attempt failed "
+            "(e.g. LLM errors) and no patches were produced."
+        )
+        return None
+
     # The key difference from regular SWE-Bench is the --modal true flag
     cmd = [
         "uv",