def _log_category_summary(
    task_entries: List[Dict[str, Any]],
    tasks_by_id: Dict[str, Any],
) -> None:
    """Log a summary grouped by category, matching the PinchBench website format.

    Args:
        task_entries: Result entries; each must carry a ``task_id`` and may
            carry a ``grading`` dict whose ``mean`` is the 0-1 task score.
            A missing grading dict, or an explicit ``"grading": None``,
            counts as a score of 0.0.
        tasks_by_id: Maps task_id -> task object; only ``task.category`` is
            read. Entries whose task_id is unknown are skipped silently.
    """
    # Per-category accumulators, each correctly typed (scores are floats,
    # task tallies are ints — avoids the mixed-type dict + int() cast dance).
    earned: Dict[str, float] = {}
    possible: Dict[str, float] = {}
    counts: Dict[str, int] = {}

    for entry in task_entries:
        task = tasks_by_id.get(entry["task_id"])
        if not task:
            # Entry references a task we don't know about; skip rather than crash.
            continue

        category = task.category.upper() if task.category else "UNCATEGORIZED"
        # `or {}` also guards against an explicit ``"grading": None`` value,
        # which ``entry.get("grading", {})`` would NOT cover.
        grading = entry.get("grading") or {}
        mean_score = float(grading.get("mean", 0.0))

        earned[category] = earned.get(category, 0.0) + mean_score
        # Each task is scored 0-1, so every task contributes 1.0 possible point.
        possible[category] = possible.get(category, 0.0) + 1.0
        counts[category] = counts.get(category, 0) + 1

    # Overall totals across every category.
    total_earned = sum(earned.values())
    total_possible = sum(possible.values())
    overall_pct = (total_earned / total_possible * 100) if total_possible > 0 else 0

    logger.info("\n%s", "=" * 80)
    logger.info("🦀 PINCHBENCH SCORE SUMMARY")
    logger.info("%s", "=" * 80)
    logger.info("")
    logger.info(" Overall Score: %.1f%% (%.1f / %.1f)", overall_pct, total_earned, total_possible)
    logger.info("")
    logger.info(" %-20s %8s %12s", "CATEGORY", "SCORE", "TASKS")
    logger.info(" %s", "-" * 44)

    # Sort categories alphabetically for consistent output between runs.
    for category in sorted(counts):
        pct = (earned[category] / possible[category] * 100) if possible[category] > 0 else 0
        task_count = counts[category]
        task_label = "task" if task_count == 1 else "tasks"

        # Traffic-light indicator matching the website's score thresholds.
        if pct >= 90:
            indicator = "🟢"
        elif pct >= 70:
            indicator = "🟡"
        else:
            indicator = "🔴"

        logger.info(
            " %s %-17s %6.1f%% %6d %s",
            indicator,
            category,
            pct,
            task_count,
            task_label,
        )

    logger.info(" %s", "-" * 44)
    logger.info("%s", "=" * 80)