From 9d6e47616b3fe3bc897c5f91f92ad40cfb9c6d21 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 13:44:11 +0000 Subject: [PATCH 1/5] Update leaderboard: Claude Code (claude-sonnet-4-6) - Run 28240911816 --- docs/_data/code-review.json | 144 ++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index dd41a5ef9..36f67e96f 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3713,6 +3713,126 @@ "synthetic__upgrade-006": 0.133333, "synthetic__privacy-011": 0.4 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 74.6, + "average_prompt_tokens": 79315.5, + "average_completion_tokens": 4510.7, + "average_llm_duration": 73.5, + "average_tool_usage": { + "Bash": 1.47, + "Read": 0.98, + "Write": 1.0, + "Glob": 0.01 + }, + "github_run_id": "28240911816", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 518, + "expected_comment_count": 216, + "matched_comment_count": 88, + "incorrect_comment_count": 430, + "missed_comment_count": 128, + "precision": 0.17, + "recall": 0.407, + "f1": 0.24, + "f_beta_05": 0.192, + "f_beta_2": 0.318, + "macro_precision": 0.177, + "macro_recall": 0.541, + "macro_f1": 0.255, + "macro_f_beta_05": 0.201, + "macro_f_beta_2": 0.363, + "severity_mae": 0.682, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.142857, + "synthetic__upgrade-008": 0.0, + "synthetic__performance-006": 0.571429, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.5, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.4, + "synthetic__performance-015": 0.333333, + "synthetic__style-013": 0.173913, + "synthetic__performance-005": 0.444444, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.222222, + "synthetic__performance-013": 0.2, + "synthetic__privacy-007": 0.285714, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.4, + "synthetic__performance-018": 0.166667, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.4, + "synthetic__security-003": 0.4, + "synthetic__style-016": 0.5, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.285714, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.333333, + "synthetic__privacy-012": 0.153846, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.5, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.186047, + "synthetic__security-005": 0.571429, + "synthetic__performance-002": 0.363636, + "synthetic__style-008": 0.333333, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.333333, + "synthetic__performance-001": 0.333333, + "synthetic__privacy-014": 0.333333, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 0.5, + "synthetic__privacy-005": 0.25, + "synthetic__upgrade-003": 0.4, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.333333, + "synthetic__performance-008": 0.444444, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.117647, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.222222, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.444444, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.153846, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.285714, + "synthetic__security-010": 0.5, + "synthetic__upgrade-002": 0.4, + "synthetic__performance-022": 0.4, + "synthetic__performance-003": 0.222222, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.25, + "synthetic__performance-011": 0.666667, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.4, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.285714, + "synthetic__upgrade-006": 0.181818, + "synthetic__privacy-011": 0.666667 + } } ], "aggregate": [ @@ -3859,6 +3979,30 @@ "macro_f_beta_2": 0.4354, "macro_precision": 0.3412, "macro_recall": 0.5126000000000001 + }, + { + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "experiment": null, + "total": 81, + "num_runs": 1, + "average_duration": 74.6, + "benchmark_version": "0.6.0", + "f1": 0.24, + "f1_ci_low": null, + "f1_ci_high": null, + "f_beta_05": 0.192, + "f_beta_2": 0.318, + "precision": 0.17, + "recall": 0.407, + "macro_f1": 0.255, + "macro_f1_ci_low": 0.211, + "macro_f1_ci_high": 0.301, + "macro_f_beta_05": 0.201, + "macro_f_beta_2": 0.363, + "macro_precision": 0.177, + "macro_recall": 0.541 } ] } From 81ae9f75eb51bfa4f9036d722bf91cae219603de Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 14:05:46 +0000 Subject: [PATCH 2/5] Update leaderboard: Claude Code (claude-sonnet-4-6) - Run 28242115307 --- docs/_data/code-review.json | 151 ++++++++++++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 16 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 36f67e96f..adc4b9464 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3833,6 +3833,125 @@ "synthetic__upgrade-006": 0.181818, "synthetic__privacy-011": 0.666667 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 75.7, + "average_prompt_tokens": 75539.0, + "average_completion_tokens": 4643.5, + "average_llm_duration": 75.0, + "average_tool_usage": { + "Bash": 1.46, + "Read": 0.81, + "Write": 1.0 + }, + "github_run_id": "28242115307", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 526, + "expected_comment_count": 216, + "matched_comment_count": 91, + "incorrect_comment_count": 435, + "missed_comment_count": 125, + "precision": 0.173, + "recall": 0.421, + "f1": 0.245, + "f_beta_05": 0.196, + "f_beta_2": 0.327, + "macro_precision": 0.175, + "macro_recall": 0.531, + "macro_f1": 0.247, + "macro_f_beta_05": 0.197, + "macro_f_beta_2": 0.35, + "severity_mae": 0.714, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.5, + "synthetic__upgrade-005": 0.153846, + "synthetic__upgrade-008": 0.0, + "synthetic__performance-006": 0.333333, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.666667, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.333333, + "synthetic__performance-015": 0.4, + "synthetic__style-013": 0.16, + "synthetic__performance-005": 0.571429, + "synthetic__performance-020": 0.181818, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.444444, + "synthetic__privacy-007": 0.333333, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.285714, + "synthetic__performance-018": 0.181818, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.5, + "synthetic__security-003": 0.333333, + "synthetic__style-016": 0.2, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.285714, + "synthetic__style-014": 0.181818, + "synthetic__security-004": 0.285714, + "synthetic__privacy-012": 0.133333, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.25, + "synthetic__style-011": 0.4, + "synthetic__security-005": 0.571429, + "synthetic__performance-002": 0.5, + "synthetic__style-008": 0.285714, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.727273, + "synthetic__privacy-002": 0.285714, + "synthetic__performance-001": 0.4, + "synthetic__privacy-014": 0.333333, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 0.5, + "synthetic__privacy-005": 0.285714, + "synthetic__upgrade-003": 0.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.2, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.444444, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.190476, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.285714, + "synthetic__performance-021": 0.363636, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.142857, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.25, + "synthetic__security-010": 0.545455, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.307692, + "synthetic__performance-003": 0.166667, + "synthetic__privacy-001": 0.25, + "synthetic__upgrade-004": 0.285714, + "synthetic__performance-011": 0.666667, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.25, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.25, + "synthetic__upgrade-006": 0.210526, + "synthetic__privacy-011": 0.461538 + } } ], "aggregate": [ @@ -3986,23 +4105,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 1, - "average_duration": 74.6, + "num_runs": 2, + "average_duration": 75.15, "benchmark_version": "0.6.0", - "f1": 0.24, - "f1_ci_low": null, - "f1_ci_high": null, - "f_beta_05": 0.192, - "f_beta_2": 0.318, - "precision": 0.17, - "recall": 0.407, - "macro_f1": 0.255, - "macro_f1_ci_low": 0.211, - "macro_f1_ci_high": 0.301, - "macro_f_beta_05": 0.201, - "macro_f_beta_2": 0.363, - "macro_precision": 0.177, - "macro_recall": 0.541 + "f1": 0.242, + "f1_ci_low": 0.24, + "f1_ci_high": 0.245, + "f_beta_05": 0.194, + "f_beta_2": 0.3225, + "precision": 0.17149999999999999, + "recall": 0.414, + "macro_f1": 0.251, + "macro_f1_ci_low": 0.22, + "macro_f1_ci_high": 0.284, + "macro_f_beta_05": 0.199, + "macro_f_beta_2": 0.3565, + "macro_precision": 0.176, + "macro_recall": 0.536 } ] } From e6f2d96c8d214f9b9ed023514d4fec120d641959 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 14:26:37 +0000 Subject: [PATCH 3/5] Update leaderboard: Claude Code (claude-sonnet-4-6) - Run 28243306907 --- docs/_data/code-review.json | 154 ++++++++++++++++++++++++++++++++---- 1 file changed, 138 insertions(+), 16 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index adc4b9464..1c3fbde1f 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3952,6 +3952,128 @@ "synthetic__upgrade-006": 0.210526, "synthetic__privacy-011": 0.461538 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 78.8, + "average_prompt_tokens": 79802.3, + "average_completion_tokens": 4696.1, + "average_llm_duration": 77.9, + "average_tool_usage": { + "Bash": 1.62, + "Write": 1.0, + "Read": 1.17, + "Glob": 0.07, + "Agent": 0.02, + "Grep": 0.14 + }, + "github_run_id": "28243306907", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 546, + "expected_comment_count": 216, + "matched_comment_count": 87, + "incorrect_comment_count": 459, + "missed_comment_count": 129, + "precision": 0.159, + "recall": 0.403, + "f1": 0.228, + "f_beta_05": 0.181, + "f_beta_2": 0.309, + "macro_precision": 0.16, + "macro_recall": 0.49, + "macro_f1": 0.228, + "macro_f_beta_05": 0.181, + "macro_f_beta_2": 0.323, + "severity_mae": 0.747, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.166667, + "synthetic__upgrade-008": 0.2, + "synthetic__performance-006": 0.444444, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.5, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.25, + "synthetic__performance-015": 0.153846, + "synthetic__style-013": 0.16, + "synthetic__performance-005": 0.666667, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.222222, + "synthetic__privacy-007": 0.222222, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.2, + "synthetic__style-009": 0.4, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.2, + "synthetic__performance-007": 0.5, + "synthetic__security-003": 0.285714, + "synthetic__style-016": 0.545455, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.235294, + "synthetic__performance-017": 0.0, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.333333, + "synthetic__privacy-012": 0.142857, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.25641, + "synthetic__security-005": 0.25, + "synthetic__performance-002": 0.4, + "synthetic__style-008": 0.285714, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.666667, + "synthetic__privacy-002": 0.2, + "synthetic__performance-001": 0.4, + "synthetic__privacy-014": 0.6, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 0.5, + "synthetic__privacy-005": 0.333333, + "synthetic__upgrade-003": 0.4, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.272727, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.333333, + "synthetic__style-017": 0.153846, + "synthetic__style-010": 0.666667, + "synthetic__performance-012": 0.5, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.25, + "synthetic__security-010": 0.5, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.4, + "synthetic__performance-003": 0.25, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.25, + "synthetic__performance-011": 0.444444, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.333333, + "synthetic__upgrade-006": 0.363636, + "synthetic__privacy-011": 0.666667 + } } ], "aggregate": [ @@ -4105,23 +4227,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 2, - "average_duration": 75.15, + "num_runs": 3, + "average_duration": 76.36666666666666, "benchmark_version": "0.6.0", - "f1": 0.242, - "f1_ci_low": 0.24, - "f1_ci_high": 0.245, - "f_beta_05": 0.194, - "f_beta_2": 0.3225, - "precision": 0.17149999999999999, - "recall": 0.414, - "macro_f1": 0.251, - "macro_f1_ci_low": 0.22, - "macro_f1_ci_high": 0.284, - "macro_f_beta_05": 0.199, - "macro_f_beta_2": 0.3565, - "macro_precision": 0.176, - "macro_recall": 0.536 + "f1": 0.238, + "f1_ci_low": 0.228, + "f1_ci_high": 0.243, + "f_beta_05": 0.18966666666666665, + "f_beta_2": 0.318, + "precision": 0.16733333333333333, + "recall": 0.41033333333333327, + "macro_f1": 0.243, + "macro_f1_ci_low": 0.217, + "macro_f1_ci_high": 0.271, + "macro_f_beta_05": 0.19299999999999998, + "macro_f_beta_2": 0.3453333333333333, + "macro_precision": 0.17066666666666666, + "macro_recall": 0.5206666666666667 } ] } From 07b530a9aa2c83cd64ae997b3f2f75979b00c013 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 21:05:13 +0000 Subject: [PATCH 4/5] Update leaderboard: Claude Code (claude-sonnet-4-6) - Run 28244487769 --- docs/_data/code-review.json | 150 ++++++++++++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 15 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 1c3fbde1f..dd49aadc1 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -4074,6 +4074,126 @@ "synthetic__upgrade-006": 0.363636, "synthetic__privacy-011": 0.666667 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 72.4, + "average_prompt_tokens": 76769.8, + "average_completion_tokens": 4398.9, + "average_llm_duration": 71.5, + "average_tool_usage": { + "Bash": 1.52, + "Read": 0.74, + "Write": 1.0, + "Glob": 0.01 + }, + "github_run_id": "28244487769", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 517, + "expected_comment_count": 216, + "matched_comment_count": 85, + "incorrect_comment_count": 432, + "missed_comment_count": 131, + "precision": 0.164, + "recall": 0.394, + "f1": 0.232, + "f_beta_05": 0.186, + "f_beta_2": 0.308, + "macro_precision": 0.177, + "macro_recall": 0.491, + "macro_f1": 0.241, + "macro_f_beta_05": 0.196, + "macro_f_beta_2": 0.332, + "severity_mae": 0.671, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.0, + "synthetic__upgrade-008": 0.222222, + "synthetic__performance-006": 0.222222, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.666667, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.4, + "synthetic__performance-015": 0.4, + "synthetic__style-013": 0.24, + "synthetic__performance-005": 0.571429, + "synthetic__performance-020": 0.25, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.181818, + "synthetic__privacy-007": 0.333333, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.285714, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.4, + "synthetic__security-003": 0.4, + "synthetic__style-016": 0.666667, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.25, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.25, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.4, + "synthetic__privacy-012": 0.285714, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.285714, + "synthetic__style-011": 0.166667, + "synthetic__security-005": 0.4, + "synthetic__performance-002": 0.444444, + "synthetic__style-008": 0.2, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.2, + "synthetic__performance-001": 0.285714, + "synthetic__privacy-014": 0.428571, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.222222, + "synthetic__upgrade-003": 0.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.2, + "synthetic__security-008": 0.4, + "synthetic__performance-008": 0.461538, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.086957, + "synthetic__upgrade-007": 0.181818, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.444444, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.285714, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.285714, + "synthetic__security-010": 0.4, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.333333, + "synthetic__performance-003": 0.222222, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.222222, + "synthetic__performance-011": 0.2, + "synthetic__performance-004": 0.5, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.0, + "synthetic__upgrade-006": 0.454545, + "synthetic__privacy-011": 0.888889 + } } ], "aggregate": [ @@ -4227,23 +4347,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 3, - "average_duration": 76.36666666666666, + "num_runs": 4, + "average_duration": 75.375, "benchmark_version": "0.6.0", - "f1": 0.238, - "f1_ci_low": 0.228, - "f1_ci_high": 0.243, - "f_beta_05": 0.18966666666666665, - "f_beta_2": 0.318, - "precision": 0.16733333333333333, - "recall": 0.41033333333333327, + "f1": 0.236, + "f1_ci_low": 0.23, + "f1_ci_high": 0.242, + "f_beta_05": 0.18875, + "f_beta_2": 0.3155, + "precision": 0.1665, + "recall": 0.40625, "macro_f1": 0.243, - "macro_f1_ci_low": 0.217, - "macro_f1_ci_high": 0.271, - "macro_f_beta_05": 0.19299999999999998, - "macro_f_beta_2": 0.3453333333333333, - "macro_precision": 0.17066666666666666, - "macro_recall": 0.5206666666666667 + "macro_f1_ci_low": 0.22, + "macro_f1_ci_high": 0.266, + "macro_f_beta_05": 0.19375, + "macro_f_beta_2": 0.34199999999999997, + "macro_precision": 0.17225, + "macro_recall": 0.51325 } ] } From 7f64d92b639ed2c02710332f6bc63cf6c673ed8f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 26 Jun 2026 21:21:23 +0000 Subject: [PATCH 5/5] Update leaderboard: Claude Code (claude-sonnet-4-6) - Run 28265285082 --- docs/_data/code-review.json | 154 ++++++++++++++++++++++++++++++++---- 1 file changed, 138 insertions(+), 16 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index dd49aadc1..231308881 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -4194,6 +4194,128 @@ "synthetic__upgrade-006": 0.454545, "synthetic__privacy-011": 0.888889 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 77.1, + "average_prompt_tokens": 81735.6, + "average_completion_tokens": 4657.0, + "average_llm_duration": 75.8, + "average_tool_usage": { + "Bash": 2.1, + "Write": 1.0, + "Read": 1.04, + "Agent": 0.02, + "Glob": 0.06, + "Grep": 0.01 + }, + "github_run_id": "28265285082", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 549, + "expected_comment_count": 216, + "matched_comment_count": 95, + "incorrect_comment_count": 454, + "missed_comment_count": 121, + "precision": 0.173, + "recall": 0.44, + "f1": 0.248, + "f_beta_05": 0.197, + "f_beta_2": 0.336, + "macro_precision": 0.184, + "macro_recall": 0.525, + "macro_f1": 0.257, + "macro_f_beta_05": 0.207, + "macro_f_beta_2": 0.357, + "severity_mae": 0.663, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.5, + "synthetic__upgrade-005": 0.166667, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.4, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.4, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.5, + "synthetic__style-004": 0.285714, + "synthetic__performance-015": 0.363636, + "synthetic__style-013": 0.230769, + "synthetic__performance-005": 0.545455, + "synthetic__performance-020": 0.2, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.25, + "synthetic__privacy-007": 0.285714, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.4, + "synthetic__style-009": 0.333333, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.5, + "synthetic__security-003": 0.25, + "synthetic__style-016": 0.727273, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.4, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.285714, + "synthetic__privacy-012": 0.533333, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.358974, + "synthetic__security-005": 0.571429, + "synthetic__performance-002": 0.444444, + "synthetic__style-008": 0.333333, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.25, + "synthetic__performance-001": 0.444444, + "synthetic__privacy-014": 0.571429, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.25, + "synthetic__upgrade-003": 0.5, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.333333, + "synthetic__performance-008": 0.545455, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.090909, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.0, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.285714, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.222222, + "synthetic__security-010": 0.5, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.5, + "synthetic__performance-003": 0.222222, + "synthetic__privacy-001": 0.4, + "synthetic__upgrade-004": 0.25, + "synthetic__performance-011": 0.444444, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.0, + "synthetic__upgrade-006": 0.086957, + "synthetic__privacy-011": 0.727273 + } } ], "aggregate": [ @@ -4347,23 +4469,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 4, - "average_duration": 75.375, + "num_runs": 5, + "average_duration": 75.72, "benchmark_version": "0.6.0", - "f1": 0.236, - "f1_ci_low": 0.23, - "f1_ci_high": 0.242, - "f_beta_05": 0.18875, - "f_beta_2": 0.3155, - "precision": 0.1665, - "recall": 0.40625, - "macro_f1": 0.243, - "macro_f1_ci_low": 0.22, - "macro_f1_ci_high": 0.266, - "macro_f_beta_05": 0.19375, - "macro_f_beta_2": 0.34199999999999997, - "macro_precision": 0.17225, - "macro_recall": 0.51325 + "f1": 0.239, + "f1_ci_low": 0.232, + "f1_ci_high": 0.245, + "f_beta_05": 0.19039999999999999, + "f_beta_2": 0.3196, + "precision": 0.1678, + "recall": 0.413, + "macro_f1": 0.246, + "macro_f1_ci_low": 0.224, + "macro_f1_ci_high": 0.267, + "macro_f_beta_05": 0.1964, + "macro_f_beta_2": 0.34500000000000003, + "macro_precision": 0.1746, + "macro_recall": 0.5156000000000001 } ] }