From 97691d2e6812acfb9bdb212c5646bb7e346d1d78 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 11:01:57 +0000 Subject: [PATCH 1/5] Update leaderboard: GitHub Copilot CLI (claude-opus-4.8) - Run 28233006770 --- docs/_data/code-review.json | 146 ++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 4cdb03526..a7554a9d6 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3100,6 +3100,128 @@ "synthetic__upgrade-006": 0.0, "synthetic__privacy-011": 0.0 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 69.6, + "average_prompt_tokens": 154042.0, + "average_completion_tokens": 3361.7, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.42, + "bash": 3.57, + "create": 1.0, + "view": 0.84, + "grep": 0.1, + "glob": 0.01 + }, + "github_run_id": "28233006770", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 306, + "expected_comment_count": 216, + "matched_comment_count": 78, + "incorrect_comment_count": 228, + "missed_comment_count": 138, + "precision": 0.255, + "recall": 0.361, + "f1": 0.299, + "f_beta_05": 0.271, + "f_beta_2": 0.333, + "macro_precision": 0.325, + "macro_recall": 0.501, + "macro_f1": 0.366, + "macro_f_beta_05": 0.336, + "macro_f_beta_2": 0.422, + "severity_mae": 0.474, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.222222, + "synthetic__performance-006": 0.571429, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.5, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.571429, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.25, + "synthetic__performance-013": 0.571429, + "synthetic__privacy-007": 0.666667, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.666667, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.666667, + "synthetic__style-016": 0.5, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.333333, + "synthetic__style-012": 0.153846, + "synthetic__performance-017": 0.333333, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.666667, + "synthetic__privacy-012": 0.6, + "synthetic__style-005": 0.666667, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.4, + "synthetic__style-011": 0.058824, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 0.666667, + "synthetic__style-008": 1.0, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.5, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 0.666667, + "synthetic__privacy-014": 0.888889, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.5, + "synthetic__upgrade-003": 1.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.307692, + "synthetic__upgrade-007": 0.181818, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.0, + "synthetic__style-017": 0.153846, + "synthetic__style-010": 0.666667, + "synthetic__performance-012": 0.25, + "synthetic__style-002": 0.8, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.4, + "synthetic__security-010": 0.666667, + "synthetic__upgrade-002": 0.5, + "synthetic__performance-022": 0.363636, + "synthetic__performance-003": 0.5, + "synthetic__privacy-001": 0.4, + "synthetic__upgrade-004": 0.5, + "synthetic__performance-011": 0.333333, + "synthetic__performance-004": 1.0, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.8, + "synthetic__style-003": 0.222222, + "synthetic__upgrade-006": 0.133333, + "synthetic__privacy-011": 1.0 + } } ], "aggregate": [ @@ -3222,6 +3344,30 @@ "macro_f_beta_2": 0.2774, "macro_precision": 0.2836, "macro_recall": 0.2928 + }, + { + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "experiment": null, + "total": 81, + "num_runs": 1, + "average_duration": 69.6, + "benchmark_version": "0.6.0", + "f1": 0.299, + "f1_ci_low": null, + "f1_ci_high": null, + "f_beta_05": 0.271, + "f_beta_2": 0.333, + "precision": 0.255, + "recall": 0.361, + "macro_f1": 0.366, + "macro_f1_ci_low": 0.297, + "macro_f1_ci_high": 0.439, + "macro_f_beta_05": 0.336, + "macro_f_beta_2": 0.422, + "macro_precision": 0.325, + "macro_recall": 0.501 } ] } From 7f20b0940fe9b1d3fcf3373a8cda1c70de97c40d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 11:20:48 +0000 Subject: [PATCH 2/5] Update leaderboard: GitHub Copilot CLI (claude-opus-4.8) - Run 28233959466 --- docs/_data/code-review.json | 155 ++++++++++++++++++++++++++++++++---- 1 file changed, 139 insertions(+), 16 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index a7554a9d6..932e6203d 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3222,6 +3222,129 @@ "synthetic__upgrade-006": 0.133333, "synthetic__privacy-011": 1.0 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 59.6, + "average_prompt_tokens": 155166.7, + "average_completion_tokens": 3422.2, + "average_llm_duration": 0.0, + "average_tool_usage": { + "bash": 3.62, + "report_intent": 1.43, + "create": 1.01, + "view": 0.77, + "grep": 0.23, + "stop_bash": 0.02, + "glob": 0.01 + }, + "github_run_id": "28233959466", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 305, + "expected_comment_count": 216, + "matched_comment_count": 83, + "incorrect_comment_count": 222, + "missed_comment_count": 133, + "precision": 0.272, + "recall": 0.384, + "f1": 0.319, + "f_beta_05": 0.289, + "f_beta_2": 0.355, + "macro_precision": 0.338, + "macro_recall": 0.513, + "macro_f1": 0.378, + "macro_f_beta_05": 0.348, + "macro_f_beta_2": 0.436, + "severity_mae": 0.602, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.8, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.333333, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.285714, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.181818, + "synthetic__performance-013": 0.571429, + "synthetic__privacy-007": 1.0, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.25, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.25, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.666667, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.666667, + "synthetic__style-016": 0.666667, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.285714, + "synthetic__performance-017": 0.0, + "synthetic__style-014": 0.0, + "synthetic__security-004": 1.0, + "synthetic__privacy-012": 0.666667, + "synthetic__style-005": 0.666667, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.176471, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 0.8, + "synthetic__style-008": 0.666667, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.666667, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 0.666667, + "synthetic__privacy-014": 0.888889, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.5, + "synthetic__upgrade-003": 0.666667, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.285714, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.333333, + "synthetic__upgrade-007": 0.2, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.333333, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.333333, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.166667, + "synthetic__performance-016": 0.4, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.666667, + "synthetic__security-010": 0.75, + "synthetic__upgrade-002": 0.5, + "synthetic__performance-022": 0.222222, + "synthetic__performance-003": 0.571429, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.5, + "synthetic__performance-011": 0.0, + "synthetic__performance-004": 1.0, + "synthetic__security-002": 0.4, + "synthetic__privacy-013": 0.8, + "synthetic__style-003": 0.25, + "synthetic__upgrade-006": 0.266667, + "synthetic__privacy-011": 1.0 + } } ], "aggregate": [ @@ -3351,23 +3474,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 1, - "average_duration": 69.6, + "num_runs": 2, + "average_duration": 64.6, "benchmark_version": "0.6.0", - "f1": 0.299, - "f1_ci_low": null, - "f1_ci_high": null, - "f_beta_05": 0.271, - "f_beta_2": 0.333, - "precision": 0.255, - "recall": 0.361, - "macro_f1": 0.366, - "macro_f1_ci_low": 0.297, - "macro_f1_ci_high": 0.439, - "macro_f_beta_05": 0.336, - "macro_f_beta_2": 0.422, - "macro_precision": 0.325, - "macro_recall": 0.501 + "f1": 0.309, + "f1_ci_low": 0.299, + "f1_ci_high": 0.319, + "f_beta_05": 0.28, + "f_beta_2": 0.344, + "precision": 0.2635, + "recall": 0.3725, + "macro_f1": 0.372, + "macro_f1_ci_low": 0.322, + "macro_f1_ci_high": 0.425, + "macro_f_beta_05": 0.34199999999999997, + "macro_f_beta_2": 0.429, + "macro_precision": 0.3315, + "macro_recall": 0.507 } ] } From a37d53fa9211326402c1e63baf8ef2ae1aa108da Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 11:41:57 +0000 Subject: [PATCH 3/5] Update leaderboard: GitHub Copilot CLI (claude-opus-4.8) - Run 28234982430 --- docs/_data/code-review.json | 153 ++++++++++++++++++++++++++++++++---- 1 file changed, 138 insertions(+), 15 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 932e6203d..743bb4fb4 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3345,6 +3345,129 @@ "synthetic__upgrade-006": 0.266667, "synthetic__privacy-011": 1.0 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 57.0, + "average_prompt_tokens": 152940.7, + "average_completion_tokens": 3249.4, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.53, + "bash": 3.6, + "view": 0.91, + "create": 1.0, + "grep": 0.19, + "stop_bash": 0.01, + "glob": 0.01 + }, + "github_run_id": "28234982430", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 310, + "expected_comment_count": 216, + "matched_comment_count": 89, + "incorrect_comment_count": 221, + "missed_comment_count": 127, + "precision": 0.287, + "recall": 0.412, + "f1": 0.338, + "f_beta_05": 0.306, + "f_beta_2": 0.379, + "macro_precision": 0.376, + "macro_recall": 0.554, + "macro_f1": 0.414, + "macro_f_beta_05": 0.386, + "macro_f_beta_2": 0.472, + "severity_mae": 0.494, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.666667, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.666667, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.285714, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.571429, + "synthetic__privacy-007": 0.666667, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.222222, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.4, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.5, + "synthetic__style-016": 0.25, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.666667, + "synthetic__style-012": 0.333333, + "synthetic__performance-017": 0.4, + "synthetic__style-014": 0.222222, + "synthetic__security-004": 0.666667, + "synthetic__privacy-012": 0.8, + "synthetic__style-005": 0.5, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.216216, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 1.0, + "synthetic__style-008": 1.0, + "synthetic__style-006": 0.0, + "synthetic__style-015": 1.0, + "synthetic__privacy-002": 0.333333, + "synthetic__performance-001": 0.666667, + "synthetic__privacy-014": 0.888889, + "synthetic__style-001": 1.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.153846, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.666667, + "synthetic__upgrade-003": 1.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.285714, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.461538, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.4, + "synthetic__style-017": 0.153846, + "synthetic__style-010": 0.666667, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.4, + "synthetic__security-001": 0.5, + "synthetic__security-010": 0.666667, + "synthetic__upgrade-002": 0.5, + "synthetic__performance-022": 0.5, + "synthetic__performance-003": 0.25, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 1.0, + "synthetic__performance-011": 0.0, + "synthetic__performance-004": 1.0, + "synthetic__security-002": 0.5, + "synthetic__privacy-013": 0.8, + "synthetic__style-003": 0.222222, + "synthetic__upgrade-006": 0.125, + "synthetic__privacy-011": 1.0 + } } ], "aggregate": [ @@ -3474,23 +3597,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 2, - "average_duration": 64.6, + "num_runs": 3, + "average_duration": 62.06666666666666, "benchmark_version": "0.6.0", - "f1": 0.309, + "f1": 0.319, "f1_ci_low": 0.299, - "f1_ci_high": 0.319, - "f_beta_05": 0.28, - "f_beta_2": 0.344, - "precision": 0.2635, - "recall": 0.3725, - "macro_f1": 0.372, - "macro_f1_ci_low": 0.322, - "macro_f1_ci_high": 0.425, - "macro_f_beta_05": 0.34199999999999997, - "macro_f_beta_2": 0.429, - "macro_precision": 0.3315, - "macro_recall": 0.507 + "f1_ci_high": 0.338, + "f_beta_05": 0.2886666666666667, + "f_beta_2": 0.35566666666666663, + "precision": 0.27133333333333337, + "recall": 0.38566666666666666, + "macro_f1": 0.386, + "macro_f1_ci_low": 0.345, + "macro_f1_ci_high": 0.428, + "macro_f_beta_05": 0.3566666666666667, + "macro_f_beta_2": 0.44333333333333336, + "macro_precision": 0.3463333333333334, + "macro_recall": 0.5226666666666667 } ] } From f3b020a6c355386bdba3744bd56b7751a07263a4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 12:14:03 +0000 Subject: [PATCH 4/5] Update leaderboard: GitHub Copilot CLI (claude-opus-4.8) - Run 28235940363 --- docs/_data/code-review.json | 151 ++++++++++++++++++++++++++++++++---- 1 file changed, 136 insertions(+), 15 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 743bb4fb4..28ce56381 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3468,6 +3468,127 @@ "synthetic__upgrade-006": 0.125, "synthetic__privacy-011": 1.0 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 56.2, + "average_prompt_tokens": 144754.3, + "average_completion_tokens": 3250.4, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.33, + "bash": 3.42, + "create": 1.04, + "grep": 0.11, + "view": 0.62 + }, + "github_run_id": "28235940363", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 299, + "expected_comment_count": 216, + "matched_comment_count": 85, + "incorrect_comment_count": 214, + "missed_comment_count": 131, + "precision": 0.284, + "recall": 0.394, + "f1": 0.33, + "f_beta_05": 0.301, + "f_beta_2": 0.365, + "macro_precision": 0.37, + "macro_recall": 0.516, + "macro_f1": 0.4, + "macro_f_beta_05": 0.374, + "macro_f_beta_2": 0.451, + "severity_mae": 0.565, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.4, + "synthetic__performance-006": 0.571429, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.0, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.222222, + "synthetic__performance-013": 0.0, + "synthetic__privacy-007": 0.666667, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 1.0, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.666667, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 1.0, + "synthetic__style-016": 0.444444, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.5, + "synthetic__style-012": 0.285714, + "synthetic__performance-017": 0.285714, + "synthetic__style-014": 0.0, + "synthetic__security-004": 1.0, + "synthetic__privacy-012": 0.727273, + "synthetic__style-005": 1.0, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.5, + "synthetic__style-011": 0.352941, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 1.0, + "synthetic__style-008": 0.666667, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 1.0, + "synthetic__privacy-014": 1.0, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.666667, + "synthetic__upgrade-003": 1.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.333333, + "synthetic__security-008": 0.666667, + "synthetic__performance-008": 1.0, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.285714, + "synthetic__upgrade-007": 0.181818, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.285714, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.8, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.25, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.5, + "synthetic__security-010": 0.888889, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.363636, + "synthetic__performance-003": 0.5, + "synthetic__privacy-001": 0.4, + "synthetic__upgrade-004": 0.4, + "synthetic__performance-011": 0.0, + "synthetic__performance-004": 0.5, + "synthetic__security-002": 0.4, + "synthetic__privacy-013": 0.666667, + "synthetic__style-003": 1.0, + "synthetic__upgrade-006": 0.125, + "synthetic__privacy-011": 0.4 + } } ], "aggregate": [ @@ -3597,23 +3718,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 3, - "average_duration": 62.06666666666666, + "num_runs": 4, + "average_duration": 60.6, "benchmark_version": "0.6.0", - "f1": 0.319, - "f1_ci_low": 0.299, - "f1_ci_high": 0.338, - "f_beta_05": 0.2886666666666667, - "f_beta_2": 0.35566666666666663, - "precision": 0.27133333333333337, - "recall": 0.38566666666666666, - "macro_f1": 0.386, - "macro_f1_ci_low": 0.345, + "f1": 0.322, + "f1_ci_low": 0.304, + "f1_ci_high": 0.333, + "f_beta_05": 0.29175, + "f_beta_2": 0.358, + "precision": 0.27449999999999997, + "recall": 0.38775, + "macro_f1": 0.39, + "macro_f1_ci_low": 0.353, "macro_f1_ci_high": 0.428, - "macro_f_beta_05": 0.3566666666666667, - "macro_f_beta_2": 0.44333333333333336, - "macro_precision": 0.3463333333333334, - "macro_recall": 0.5226666666666667 + "macro_f_beta_05": 0.361, + "macro_f_beta_2": 0.44525, + "macro_precision": 0.35225, + "macro_recall": 0.521 } ] } From 2dd841b09495c0499154ececc93ad6440a266e15 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 12:30:11 +0000 Subject: [PATCH 5/5] Update leaderboard: GitHub Copilot CLI (claude-opus-4.8) - Run 28237448350 --- docs/_data/code-review.json | 156 ++++++++++++++++++++++++++++++++---- 1 file changed, 140 insertions(+), 16 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 28ce56381..dd41a5ef9 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3589,6 +3589,130 @@ "synthetic__upgrade-006": 0.125, "synthetic__privacy-011": 0.4 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 61.0, + "average_prompt_tokens": 159760.5, + "average_completion_tokens": 3464.2, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.51, + "bash": 3.68, + "view": 0.93, + "create": 1.0, + "grep": 0.23, + "stop_bash": 0.01, + "edit": 0.01, + "glob": 0.02 + }, + "github_run_id": "28237448350", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 336, + "expected_comment_count": 216, + "matched_comment_count": 78, + "incorrect_comment_count": 258, + "missed_comment_count": 138, + "precision": 0.232, + "recall": 0.361, + "f1": 0.283, + "f_beta_05": 0.25, + "f_beta_2": 0.325, + "macro_precision": 0.297, + "macro_recall": 0.479, + "macro_f1": 0.335, + "macro_f_beta_05": 0.305, + "macro_f_beta_2": 0.396, + "severity_mae": 0.59, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.222222, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.444444, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.444444, + "synthetic__style-013": 0.095238, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.25, + "synthetic__performance-013": 0.666667, + "synthetic__privacy-007": 0.5, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.2, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 1.0, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.666667, + "synthetic__style-016": 0.444444, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.4, + "synthetic__style-012": 0.153846, + "synthetic__performance-017": 0.25, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.666667, + "synthetic__privacy-012": 0.666667, + "synthetic__style-005": 0.666667, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.4, + "synthetic__style-011": 0.114286, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 0.8, + "synthetic__style-008": 0.666667, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.857143, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 0.0, + "synthetic__privacy-014": 0.8, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.4, + "synthetic__upgrade-003": 0.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 1.0, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.428571, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.285714, + "synthetic__style-017": 0.142857, + "synthetic__style-010": 0.5, + "synthetic__performance-012": 0.5, + "synthetic__style-002": 0.8, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.4, + "synthetic__security-010": 0.666667, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.4, + "synthetic__performance-003": 0.25, + "synthetic__privacy-001": 0.666667, + "synthetic__upgrade-004": 0.5, + "synthetic__performance-011": 0.222222, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.666667, + "synthetic__style-003": 0.285714, + "synthetic__upgrade-006": 0.133333, + "synthetic__privacy-011": 0.4 + } } ], "aggregate": [ @@ -3718,23 +3842,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 4, - "average_duration": 60.6, + "num_runs": 5, + "average_duration": 60.67999999999999, "benchmark_version": "0.6.0", - "f1": 0.322, - "f1_ci_low": 0.304, - "f1_ci_high": 0.333, - "f_beta_05": 0.29175, - "f_beta_2": 0.358, - "precision": 0.27449999999999997, - "recall": 0.38775, - "macro_f1": 0.39, - "macro_f1_ci_low": 0.353, - "macro_f1_ci_high": 0.428, - "macro_f_beta_05": 0.361, - "macro_f_beta_2": 0.44525, - "macro_precision": 0.35225, - "macro_recall": 0.521 + "f1": 0.314, + "f1_ci_low": 0.293, + "f1_ci_high": 0.329, + "f_beta_05": 0.2834, + "f_beta_2": 0.35140000000000005, + "precision": 0.266, + "recall": 0.38239999999999996, + "macro_f1": 0.379, + "macro_f1_ci_low": 0.345, + "macro_f1_ci_high": 0.411, + "macro_f_beta_05": 0.3498, + "macro_f_beta_2": 0.4354, + "macro_precision": 0.3412, + "macro_recall": 0.5126000000000001 } ] }