From cf6087aeb1c316d99aa357ff7c65649166959ff2 Mon Sep 17 00:00:00 2001
From: Brendan O'Leary <brendan@olearycrew.com>
Date: Mon, 9 Mar 2026 10:09:12 -0400
Subject: [PATCH] Add final score summary and submission ID logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Logs the total score, the maximum possible score, and the percentage after benchmark completion:
  📊 Final score: 8.50/10 (85.0%)

Also logs the submission ID after a successful upload (when the upload result includes one) for easier tracking.

This enables bench_runner.sh to parse and include scores in Slack notifications.
---
 scripts/benchmark.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 36b5e05..754c64a 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -620,6 +620,12 @@ def main():
     output_path = output_dir / f"{run_id}_{model_slug}.json"
     output_path.write_text(json.dumps(aggregate, indent=2), encoding="utf-8")
 
+    # Calculate and log final score summary
+    total_score = sum(grades_by_task_id[tid]["mean"] for tid in grades_by_task_id)
+    max_score = float(len(grades_by_task_id))  # Each task has max_score of 1.0
+    score_pct = (total_score / max_score * 100) if max_score > 0 else 0
+    logger.info("📊 Final score: %.2f/%.0f (%.1f%%)", total_score, max_score, score_pct)
+
     logger.info("Saved results to %s", output_path)
     _log_efficiency_summary(efficiency, grades_by_task_id)
     if args.no_upload:
@@ -629,6 +635,8 @@ def main():
             from lib_upload import UploadError, upload_results
 
             result = upload_results(output_path, official_key=args.official_key)
+            if result.submission_id:
+                logger.info("Submission ID: %s", result.submission_id)
             if result.rank is not None:
                 logger.info("Uploaded to leaderboard: rank #%s", result.rank)
             if result.leaderboard_url: