From cf6087aeb1c316d99aa357ff7c65649166959ff2 Mon Sep 17 00:00:00 2001
From: Brendan O'Leary
Date: Mon, 9 Mar 2026 10:09:12 -0400
Subject: [PATCH] Add final score summary and submission ID logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Logs the total score as a percentage after benchmark completion:
  📊 Final score: 8.50/10 (85.0%)

Also logs the submission ID after successful upload for easier tracking.

This enables bench_runner.sh to parse and include scores in Slack
notifications.
---
 scripts/benchmark.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 36b5e05..754c64a 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -620,6 +620,12 @@ def main():
     output_path = output_dir / f"{run_id}_{model_slug}.json"
     output_path.write_text(json.dumps(aggregate, indent=2), encoding="utf-8")
 
+    # Calculate and log final score summary
+    total_score = sum(grades_by_task_id[tid]["mean"] for tid in grades_by_task_id)
+    max_score = float(len(grades_by_task_id))  # Each task has max_score of 1.0
+    score_pct = (total_score / max_score * 100) if max_score > 0 else 0
+    logger.info("📊 Final score: %.2f/%.0f (%.1f%%)", total_score, max_score, score_pct)
+
     logger.info("Saved results to %s", output_path)
     _log_efficiency_summary(efficiency, grades_by_task_id)
     if args.no_upload:
@@ -629,6 +635,8 @@ def main():
         from lib_upload import UploadError, upload_results
 
         result = upload_results(output_path, official_key=args.official_key)
+        if result.submission_id:
+            logger.info("Submission ID: %s", result.submission_id)
         if result.rank is not None:
             logger.info("Uploaded to leaderboard: rank #%s", result.rank)
         if result.leaderboard_url: