diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index dd41a5ef9..231308881 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3713,6 +3713,609 @@ "synthetic__upgrade-006": 0.133333, "synthetic__privacy-011": 0.4 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 74.6, + "average_prompt_tokens": 79315.5, + "average_completion_tokens": 4510.7, + "average_llm_duration": 73.5, + "average_tool_usage": { + "Bash": 1.47, + "Read": 0.98, + "Write": 1.0, + "Glob": 0.01 + }, + "github_run_id": "28240911816", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 518, + "expected_comment_count": 216, + "matched_comment_count": 88, + "incorrect_comment_count": 430, + "missed_comment_count": 128, + "precision": 0.17, + "recall": 0.407, + "f1": 0.24, + "f_beta_05": 0.192, + "f_beta_2": 0.318, + "macro_precision": 0.177, + "macro_recall": 0.541, + "macro_f1": 0.255, + "macro_f_beta_05": 0.201, + "macro_f_beta_2": 0.363, + "severity_mae": 0.682, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.142857, + "synthetic__upgrade-008": 0.0, + "synthetic__performance-006": 0.571429, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.5, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.4, + "synthetic__performance-015": 0.333333, + "synthetic__style-013": 0.173913, + "synthetic__performance-005": 0.444444, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.222222, + "synthetic__performance-013": 0.2, + "synthetic__privacy-007": 0.285714, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.4, + "synthetic__performance-018": 0.166667, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.4, + "synthetic__security-003": 0.4, + "synthetic__style-016": 0.5, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.285714, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.333333, + "synthetic__privacy-012": 0.153846, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.5, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.186047, + "synthetic__security-005": 0.571429, + "synthetic__performance-002": 0.363636, + "synthetic__style-008": 0.333333, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.333333, + "synthetic__performance-001": 0.333333, + "synthetic__privacy-014": 0.333333, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 0.5, + "synthetic__privacy-005": 0.25, + "synthetic__upgrade-003": 0.4, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.333333, + "synthetic__performance-008": 0.444444, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.117647, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.222222, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.444444, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.153846, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.285714, + "synthetic__security-010": 0.5, + "synthetic__upgrade-002": 0.4, + "synthetic__performance-022": 0.4, + "synthetic__performance-003": 0.222222, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.25, + "synthetic__performance-011": 0.666667, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.4, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.285714, + "synthetic__upgrade-006": 0.181818, + "synthetic__privacy-011": 0.666667 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 75.7, + "average_prompt_tokens": 75539.0, + "average_completion_tokens": 4643.5, + "average_llm_duration": 75.0, + "average_tool_usage": { + "Bash": 1.46, + "Read": 0.81, + "Write": 1.0 + }, + "github_run_id": "28242115307", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 526, + "expected_comment_count": 216, + "matched_comment_count": 91, + "incorrect_comment_count": 435, + "missed_comment_count": 125, + "precision": 0.173, + "recall": 0.421, + "f1": 0.245, + "f_beta_05": 0.196, + "f_beta_2": 0.327, + "macro_precision": 0.175, + "macro_recall": 0.531, + "macro_f1": 0.247, + "macro_f_beta_05": 0.197, + "macro_f_beta_2": 0.35, + "severity_mae": 0.714, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.5, + "synthetic__upgrade-005": 0.153846, + "synthetic__upgrade-008": 0.0, + "synthetic__performance-006": 0.333333, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.666667, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.333333, + "synthetic__performance-015": 0.4, + "synthetic__style-013": 0.16, + "synthetic__performance-005": 0.571429, + "synthetic__performance-020": 0.181818, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.444444, + "synthetic__privacy-007": 0.333333, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.285714, + "synthetic__performance-018": 0.181818, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.5, + "synthetic__security-003": 0.333333, + "synthetic__style-016": 0.2, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.285714, + "synthetic__style-014": 0.181818, + "synthetic__security-004": 0.285714, + "synthetic__privacy-012": 0.133333, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.25, + "synthetic__style-011": 0.4, + "synthetic__security-005": 0.571429, + "synthetic__performance-002": 0.5, + "synthetic__style-008": 0.285714, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.727273, + "synthetic__privacy-002": 0.285714, + "synthetic__performance-001": 0.4, + "synthetic__privacy-014": 0.333333, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 0.5, + "synthetic__privacy-005": 0.285714, + "synthetic__upgrade-003": 0.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.2, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.444444, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.190476, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.285714, + "synthetic__performance-021": 0.363636, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.142857, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.25, + "synthetic__security-010": 0.545455, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.307692, + "synthetic__performance-003": 0.166667, + "synthetic__privacy-001": 0.25, + "synthetic__upgrade-004": 0.285714, + "synthetic__performance-011": 0.666667, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.25, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.25, + "synthetic__upgrade-006": 0.210526, + "synthetic__privacy-011": 0.461538 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 78.8, + "average_prompt_tokens": 79802.3, + "average_completion_tokens": 4696.1, + "average_llm_duration": 77.9, + "average_tool_usage": { + "Bash": 1.62, + "Write": 1.0, + "Read": 1.17, + "Glob": 0.07, + "Agent": 0.02, + "Grep": 0.14 + }, + "github_run_id": "28243306907", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 546, + "expected_comment_count": 216, + "matched_comment_count": 87, + "incorrect_comment_count": 459, + "missed_comment_count": 129, + "precision": 0.159, + "recall": 0.403, + "f1": 0.228, + "f_beta_05": 0.181, + "f_beta_2": 0.309, + "macro_precision": 0.16, + "macro_recall": 0.49, + "macro_f1": 0.228, + "macro_f_beta_05": 0.181, + "macro_f_beta_2": 0.323, + "severity_mae": 0.747, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.166667, + "synthetic__upgrade-008": 0.2, + "synthetic__performance-006": 0.444444, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.5, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.25, + "synthetic__performance-015": 0.153846, + "synthetic__style-013": 0.16, + "synthetic__performance-005": 0.666667, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.222222, + "synthetic__privacy-007": 0.222222, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.2, + "synthetic__style-009": 0.4, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.2, + "synthetic__performance-007": 0.5, + "synthetic__security-003": 0.285714, + "synthetic__style-016": 0.545455, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.235294, + "synthetic__performance-017": 0.0, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.333333, + "synthetic__privacy-012": 0.142857, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.25641, + "synthetic__security-005": 0.25, + "synthetic__performance-002": 0.4, + "synthetic__style-008": 0.285714, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.666667, + "synthetic__privacy-002": 0.2, + "synthetic__performance-001": 0.4, + "synthetic__privacy-014": 0.6, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 0.5, + "synthetic__privacy-005": 0.333333, + "synthetic__upgrade-003": 0.4, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.272727, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.333333, + "synthetic__style-017": 0.153846, + "synthetic__style-010": 0.666667, + "synthetic__performance-012": 0.5, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.25, + "synthetic__security-010": 0.5, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.4, + "synthetic__performance-003": 0.25, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.25, + "synthetic__performance-011": 0.444444, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.333333, + "synthetic__upgrade-006": 0.363636, + "synthetic__privacy-011": 0.666667 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 72.4, + "average_prompt_tokens": 76769.8, + "average_completion_tokens": 4398.9, + "average_llm_duration": 71.5, + "average_tool_usage": { + "Bash": 1.52, + "Read": 0.74, + "Write": 1.0, + "Glob": 0.01 + }, + "github_run_id": "28244487769", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 517, + "expected_comment_count": 216, + "matched_comment_count": 85, + "incorrect_comment_count": 432, + "missed_comment_count": 131, + "precision": 0.164, + "recall": 0.394, + "f1": 0.232, + "f_beta_05": 0.186, + "f_beta_2": 0.308, + "macro_precision": 0.177, + "macro_recall": 0.491, + "macro_f1": 0.241, + "macro_f_beta_05": 0.196, + "macro_f_beta_2": 0.332, + "severity_mae": 0.671, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.0, + "synthetic__upgrade-008": 0.222222, + "synthetic__performance-006": 0.222222, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.666667, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.333333, + "synthetic__style-004": 0.4, + "synthetic__performance-015": 0.4, + "synthetic__style-013": 0.24, + "synthetic__performance-005": 0.571429, + "synthetic__performance-020": 0.25, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.181818, + "synthetic__privacy-007": 0.333333, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.285714, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.4, + "synthetic__security-003": 0.4, + "synthetic__style-016": 0.666667, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.25, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.25, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.4, + "synthetic__privacy-012": 0.285714, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.285714, + "synthetic__style-011": 0.166667, + "synthetic__security-005": 0.4, + "synthetic__performance-002": 0.444444, + "synthetic__style-008": 0.2, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.2, + "synthetic__performance-001": 0.285714, + "synthetic__privacy-014": 0.428571, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.222222, + "synthetic__upgrade-003": 0.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.2, + "synthetic__security-008": 0.4, + "synthetic__performance-008": 0.461538, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.086957, + "synthetic__upgrade-007": 0.181818, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.444444, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.285714, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.285714, + "synthetic__security-010": 0.4, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.333333, + "synthetic__performance-003": 0.222222, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.222222, + "synthetic__performance-011": 0.2, + "synthetic__performance-004": 0.5, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.0, + "synthetic__upgrade-006": 0.454545, + "synthetic__privacy-011": 0.888889 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "average_duration": 77.1, + "average_prompt_tokens": 81735.6, + "average_completion_tokens": 4657.0, + "average_llm_duration": 75.8, + "average_tool_usage": { + "Bash": 2.1, + "Write": 1.0, + "Read": 1.04, + "Agent": 0.02, + "Glob": 0.06, + "Grep": 0.01 + }, + "github_run_id": "28265285082", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 549, + "expected_comment_count": 216, + "matched_comment_count": 95, + "incorrect_comment_count": 454, + "missed_comment_count": 121, + "precision": 0.173, + "recall": 0.44, + "f1": 0.248, + "f_beta_05": 0.197, + "f_beta_2": 0.336, + "macro_precision": 0.184, + "macro_recall": 0.525, + "macro_f1": 0.257, + "macro_f_beta_05": 0.207, + "macro_f_beta_2": 0.357, + "severity_mae": 0.663, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.5, + "synthetic__upgrade-005": 0.166667, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.4, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 0.4, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.5, + "synthetic__style-004": 0.285714, + "synthetic__performance-015": 0.363636, + "synthetic__style-013": 0.230769, + "synthetic__performance-005": 0.545455, + "synthetic__performance-020": 0.2, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.25, + "synthetic__privacy-007": 0.285714, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.4, + "synthetic__style-009": 0.333333, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.5, + "synthetic__performance-007": 0.5, + "synthetic__security-003": 0.25, + "synthetic__style-016": 0.727273, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.25, + "synthetic__performance-017": 0.4, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.285714, + "synthetic__privacy-012": 0.533333, + "synthetic__style-005": 0.333333, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.358974, + "synthetic__security-005": 0.571429, + "synthetic__performance-002": 0.444444, + "synthetic__style-008": 0.333333, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.25, + "synthetic__performance-001": 0.444444, + "synthetic__privacy-014": 0.571429, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.25, + "synthetic__upgrade-003": 0.5, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.333333, + "synthetic__performance-008": 0.545455, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.090909, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.0, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.571429, + "synthetic__performance-012": 0.285714, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.222222, + "synthetic__security-010": 0.5, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.5, + "synthetic__performance-003": 0.222222, + "synthetic__privacy-001": 0.4, + "synthetic__upgrade-004": 0.25, + "synthetic__performance-011": 0.444444, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.571429, + "synthetic__style-003": 0.0, + "synthetic__upgrade-006": 0.086957, + "synthetic__privacy-011": 0.727273 + } } ], "aggregate": [ @@ -3859,6 +4462,30 @@ "macro_f_beta_2": 0.4354, "macro_precision": 0.3412, "macro_recall": 0.5126000000000001 + }, + { + "model": "claude-sonnet-4-6", + "agent_name": "Claude Code", + "category": "code-review", + "experiment": null, + "total": 81, + "num_runs": 5, + "average_duration": 75.72, + "benchmark_version": "0.6.0", + "f1": 0.239, + "f1_ci_low": 0.232, + "f1_ci_high": 0.245, + "f_beta_05": 0.19039999999999999, + "f_beta_2": 0.3196, + "precision": 0.1678, + "recall": 0.413, + "macro_f1": 0.246, + "macro_f1_ci_low": 0.224, + "macro_f1_ci_high": 0.267, + "macro_f_beta_05": 0.1964, + "macro_f_beta_2": 0.34500000000000003, + "macro_precision": 0.1746, + "macro_recall": 0.5156000000000001 } ] }