def _log_category_summary(
    task_entries: List[Dict[str, Any]],
    tasks_by_id: Dict[str, Any],
) -> None:
    """Log a summary grouped by category, matching the PinchBench website format.

    Args:
        task_entries: Result entries; each must carry a ``task_id`` and may
            carry a ``grading`` dict whose ``mean`` is the 0-1 task score.
            A missing grading dict, or an explicit ``"grading": None``,
            counts as a score of 0.0.
        tasks_by_id: Maps task_id -> task object; only ``task.category`` is
            read. Entries whose task_id is unknown are skipped silently.
    """
    # Per-category accumulators, each correctly typed (scores are floats,
    # task tallies are ints — avoids the mixed-type dict + int() cast dance).
    earned: Dict[str, float] = {}
    possible: Dict[str, float] = {}
    counts: Dict[str, int] = {}

    for entry in task_entries:
        task = tasks_by_id.get(entry["task_id"])
        if not task:
            # Entry references a task we don't know about; skip rather than crash.
            continue

        category = task.category.upper() if task.category else "UNCATEGORIZED"
        # `or {}` also guards against an explicit ``"grading": None`` value,
        # which ``entry.get("grading", {})`` would NOT cover.
        grading = entry.get("grading") or {}
        mean_score = float(grading.get("mean", 0.0))

        earned[category] = earned.get(category, 0.0) + mean_score
        # Each task is scored 0-1, so every task contributes 1.0 possible point.
        possible[category] = possible.get(category, 0.0) + 1.0
        counts[category] = counts.get(category, 0) + 1

    # Overall totals across every category.
    total_earned = sum(earned.values())
    total_possible = sum(possible.values())
    overall_pct = (total_earned / total_possible * 100) if total_possible > 0 else 0

    logger.info("\n%s", "=" * 80)
    logger.info("🦀 PINCHBENCH SCORE SUMMARY")
    logger.info("%s", "=" * 80)
    logger.info("")
    logger.info(" Overall Score: %.1f%% (%.1f / %.1f)", overall_pct, total_earned, total_possible)
    logger.info("")
    logger.info(" %-20s %8s %12s", "CATEGORY", "SCORE", "TASKS")
    logger.info(" %s", "-" * 44)

    # Sort categories alphabetically for consistent output between runs.
    for category in sorted(counts):
        pct = (earned[category] / possible[category] * 100) if possible[category] > 0 else 0
        task_count = counts[category]
        task_label = "task" if task_count == 1 else "tasks"

        # Traffic-light indicator matching the website's score thresholds.
        if pct >= 90:
            indicator = "🟢"
        elif pct >= 70:
            indicator = "🟡"
        else:
            indicator = "🔴"

        logger.info(
            " %s %-17s %6.1f%% %6d %s",
            indicator,
            category,
            pct,
            task_count,
            task_label,
        )

    logger.info(" %s", "-" * 44)
    logger.info("%s", "=" * 80)