diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 4cdb03526..dd41a5ef9 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -3100,6 +3100,619 @@ "synthetic__upgrade-006": 0.0, "synthetic__privacy-011": 0.0 } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 69.6, + "average_prompt_tokens": 154042.0, + "average_completion_tokens": 3361.7, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.42, + "bash": 3.57, + "create": 1.0, + "view": 0.84, + "grep": 0.1, + "glob": 0.01 + }, + "github_run_id": "28233006770", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 306, + "expected_comment_count": 216, + "matched_comment_count": 78, + "incorrect_comment_count": 228, + "missed_comment_count": 138, + "precision": 0.255, + "recall": 0.361, + "f1": 0.299, + "f_beta_05": 0.271, + "f_beta_2": 0.333, + "macro_precision": 0.325, + "macro_recall": 0.501, + "macro_f1": 0.366, + "macro_f_beta_05": 0.336, + "macro_f_beta_2": 0.422, + "severity_mae": 0.474, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.222222, + "synthetic__performance-006": 0.571429, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.5, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.571429, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.25, + "synthetic__performance-013": 0.571429, + "synthetic__privacy-007": 0.666667, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.666667, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.666667, + "synthetic__style-016": 0.5, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.333333, + "synthetic__style-012": 0.153846, + "synthetic__performance-017": 0.333333, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.666667, + "synthetic__privacy-012": 0.6, + "synthetic__style-005": 0.666667, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.4, + "synthetic__style-011": 0.058824, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 0.666667, + "synthetic__style-008": 1.0, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.5, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 0.666667, + "synthetic__privacy-014": 0.888889, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.5, + "synthetic__upgrade-003": 1.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.307692, + "synthetic__upgrade-007": 0.181818, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.0, + "synthetic__style-017": 0.153846, + "synthetic__style-010": 0.666667, + "synthetic__performance-012": 0.25, + "synthetic__style-002": 0.8, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.4, + "synthetic__security-010": 0.666667, + "synthetic__upgrade-002": 0.5, + "synthetic__performance-022": 0.363636, + "synthetic__performance-003": 0.5, + "synthetic__privacy-001": 0.4, + "synthetic__upgrade-004": 0.5, + "synthetic__performance-011": 0.333333, + "synthetic__performance-004": 1.0, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.8, + "synthetic__style-003": 0.222222, + "synthetic__upgrade-006": 0.133333, + "synthetic__privacy-011": 1.0 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 59.6, + "average_prompt_tokens": 155166.7, + "average_completion_tokens": 3422.2, + "average_llm_duration": 0.0, + "average_tool_usage": { + "bash": 3.62, + "report_intent": 1.43, + "create": 1.01, + "view": 0.77, + "grep": 0.23, + "stop_bash": 0.02, + "glob": 0.01 + }, + "github_run_id": "28233959466", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 305, + "expected_comment_count": 216, + "matched_comment_count": 83, + "incorrect_comment_count": 222, + "missed_comment_count": 133, + "precision": 0.272, + "recall": 0.384, + "f1": 0.319, + "f_beta_05": 0.289, + "f_beta_2": 0.355, + "macro_precision": 0.338, + "macro_recall": 0.513, + "macro_f1": 0.378, + "macro_f_beta_05": 0.348, + "macro_f_beta_2": 0.436, + "severity_mae": 0.602, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.8, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.333333, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.285714, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.181818, + "synthetic__performance-013": 0.571429, + "synthetic__privacy-007": 1.0, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.25, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.25, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.666667, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.666667, + "synthetic__style-016": 0.666667, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.0, + "synthetic__style-012": 0.285714, + "synthetic__performance-017": 0.0, + "synthetic__style-014": 0.0, + "synthetic__security-004": 1.0, + "synthetic__privacy-012": 0.666667, + "synthetic__style-005": 0.666667, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.176471, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 0.8, + "synthetic__style-008": 0.666667, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.666667, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 0.666667, + "synthetic__privacy-014": 0.888889, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.5, + "synthetic__upgrade-003": 0.666667, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.285714, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.333333, + "synthetic__upgrade-007": 0.2, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.333333, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.333333, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.166667, + "synthetic__performance-016": 0.4, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.666667, + "synthetic__security-010": 0.75, + "synthetic__upgrade-002": 0.5, + "synthetic__performance-022": 0.222222, + "synthetic__performance-003": 0.571429, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 0.5, + "synthetic__performance-011": 0.0, + "synthetic__performance-004": 1.0, + "synthetic__security-002": 0.4, + "synthetic__privacy-013": 0.8, + "synthetic__style-003": 0.25, + "synthetic__upgrade-006": 0.266667, + "synthetic__privacy-011": 1.0 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 57.0, + "average_prompt_tokens": 152940.7, + "average_completion_tokens": 3249.4, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.53, + "bash": 3.6, + "view": 0.91, + "create": 1.0, + "grep": 0.19, + "stop_bash": 0.01, + "glob": 0.01 + }, + "github_run_id": "28234982430", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 310, + "expected_comment_count": 216, + "matched_comment_count": 89, + "incorrect_comment_count": 221, + "missed_comment_count": 127, + "precision": 0.287, + "recall": 0.412, + "f1": 0.338, + "f_beta_05": 0.306, + "f_beta_2": 0.379, + "macro_precision": 0.376, + "macro_recall": 0.554, + "macro_f1": 0.414, + "macro_f_beta_05": 0.386, + "macro_f_beta_2": 0.472, + "severity_mae": 0.494, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.666667, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.666667, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.285714, + "synthetic__security-013": 0.0, + "synthetic__performance-013": 0.571429, + "synthetic__privacy-007": 0.666667, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.222222, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.4, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.5, + "synthetic__style-016": 0.25, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.666667, + "synthetic__style-012": 0.333333, + "synthetic__performance-017": 0.4, + "synthetic__style-014": 0.222222, + "synthetic__security-004": 0.666667, + "synthetic__privacy-012": 0.8, + "synthetic__style-005": 0.5, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.333333, + "synthetic__style-011": 0.216216, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 1.0, + "synthetic__style-008": 1.0, + "synthetic__style-006": 0.0, + "synthetic__style-015": 1.0, + "synthetic__privacy-002": 0.333333, + "synthetic__performance-001": 0.666667, + "synthetic__privacy-014": 0.888889, + "synthetic__style-001": 1.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.153846, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.666667, + "synthetic__upgrade-003": 1.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.285714, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 0.666667, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.461538, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.4, + "synthetic__style-017": 0.153846, + "synthetic__style-010": 0.666667, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.4, + "synthetic__security-001": 0.5, + "synthetic__security-010": 0.666667, + "synthetic__upgrade-002": 0.5, + "synthetic__performance-022": 0.5, + "synthetic__performance-003": 0.25, + "synthetic__privacy-001": 0.333333, + "synthetic__upgrade-004": 1.0, + "synthetic__performance-011": 0.0, + "synthetic__performance-004": 1.0, + "synthetic__security-002": 0.5, + "synthetic__privacy-013": 0.8, + "synthetic__style-003": 0.222222, + "synthetic__upgrade-006": 0.125, + "synthetic__privacy-011": 1.0 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 56.2, + "average_prompt_tokens": 144754.3, + "average_completion_tokens": 3250.4, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.33, + "bash": 3.42, + "create": 1.04, + "grep": 0.11, + "view": 0.62 + }, + "github_run_id": "28235940363", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 299, + "expected_comment_count": 216, + "matched_comment_count": 85, + "incorrect_comment_count": 214, + "missed_comment_count": 131, + "precision": 0.284, + "recall": 0.394, + "f1": 0.33, + "f_beta_05": 0.301, + "f_beta_2": 0.365, + "macro_precision": 0.37, + "macro_recall": 0.516, + "macro_f1": 0.4, + "macro_f_beta_05": 0.374, + "macro_f_beta_2": 0.451, + "severity_mae": 0.565, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.25, + "synthetic__upgrade-008": 0.4, + "synthetic__performance-006": 0.571429, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.0, + "synthetic__style-013": 0.0, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.222222, + "synthetic__performance-013": 0.0, + "synthetic__privacy-007": 0.666667, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 1.0, + "synthetic__performance-018": 0.0, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 0.666667, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 1.0, + "synthetic__style-016": 0.444444, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.5, + "synthetic__style-012": 0.285714, + "synthetic__performance-017": 0.285714, + "synthetic__style-014": 0.0, + "synthetic__security-004": 1.0, + "synthetic__privacy-012": 0.727273, + "synthetic__style-005": 1.0, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.5, + "synthetic__style-011": 0.352941, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 1.0, + "synthetic__style-008": 0.666667, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.8, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 1.0, + "synthetic__privacy-014": 1.0, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.666667, + "synthetic__upgrade-003": 1.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.333333, + "synthetic__security-008": 0.666667, + "synthetic__performance-008": 1.0, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.285714, + "synthetic__upgrade-007": 0.181818, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.285714, + "synthetic__style-017": 0.0, + "synthetic__style-010": 0.8, + "synthetic__performance-012": 0.571429, + "synthetic__style-002": 0.0, + "synthetic__privacy-010": 0.25, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.5, + "synthetic__security-010": 0.888889, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.363636, + "synthetic__performance-003": 0.5, + "synthetic__privacy-001": 0.4, + "synthetic__upgrade-004": 0.4, + "synthetic__performance-011": 0.0, + "synthetic__performance-004": 0.5, + "synthetic__security-002": 0.4, + "synthetic__privacy-013": 0.666667, + "synthetic__style-003": 1.0, + "synthetic__upgrade-006": 0.125, + "synthetic__privacy-011": 0.4 + } + }, + { + "total": 81, + "date": "2026-06-26", + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 61.0, + "average_prompt_tokens": 159760.5, + "average_completion_tokens": 3464.2, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 1.51, + "bash": 3.68, + "view": 0.93, + "create": 1.0, + "grep": 0.23, + "stop_bash": 0.01, + "edit": 0.01, + "glob": 0.02 + }, + "github_run_id": "28237448350", + "experiment": null, + "benchmark_version": "0.6.0", + "generated_comment_count": 336, + "expected_comment_count": 216, + "matched_comment_count": 78, + "incorrect_comment_count": 258, + "missed_comment_count": 138, + "precision": 0.232, + "recall": 0.361, + "f1": 0.283, + "f_beta_05": 0.25, + "f_beta_2": 0.325, + "macro_precision": 0.297, + "macro_recall": 0.479, + "macro_f1": 0.335, + "macro_f_beta_05": 0.305, + "macro_f_beta_2": 0.396, + "severity_mae": 0.59, + "valid_review_output_rate": 1.0, + "instance_results": { + "synthetic__performance-023": 0.0, + "synthetic__upgrade-005": 0.222222, + "synthetic__upgrade-008": 0.181818, + "synthetic__performance-006": 0.444444, + "synthetic__performance-014": 0.0, + "synthetic__security-006": 1.0, + "synthetic__security-011": 0.0, + "synthetic__security-007": 0.666667, + "synthetic__style-004": 0.666667, + "synthetic__performance-015": 0.444444, + "synthetic__style-013": 0.095238, + "synthetic__performance-005": 0.333333, + "synthetic__performance-020": 0.0, + "synthetic__security-013": 0.25, + "synthetic__performance-013": 0.666667, + "synthetic__privacy-007": 0.5, + "synthetic__performance-010": 0.0, + "synthetic__security-009": 0.222222, + "synthetic__style-009": 0.666667, + "synthetic__performance-018": 0.2, + "synthetic__privacy-003": 0.0, + "synthetic__security-012": 1.0, + "synthetic__performance-007": 1.0, + "synthetic__security-003": 0.666667, + "synthetic__style-016": 0.444444, + "synthetic__privacy-008": 0.0, + "synthetic__privacy-006": 0.4, + "synthetic__style-012": 0.153846, + "synthetic__performance-017": 0.25, + "synthetic__style-014": 0.0, + "synthetic__security-004": 0.666667, + "synthetic__privacy-012": 0.666667, + "synthetic__style-005": 0.666667, + "synthetic__security-015": 0.0, + "synthetic__privacy-009": 0.4, + "synthetic__style-011": 0.114286, + "synthetic__security-005": 0.5, + "synthetic__performance-002": 0.8, + "synthetic__style-008": 0.666667, + "synthetic__style-006": 0.0, + "synthetic__style-015": 0.857143, + "synthetic__privacy-002": 0.5, + "synthetic__performance-001": 0.0, + "synthetic__privacy-014": 0.8, + "synthetic__style-001": 0.0, + "synthetic__privacy-004": 0.0, + "synthetic__privacy-015": 0.0, + "synthetic__security-016": 1.0, + "synthetic__privacy-005": 0.4, + "synthetic__upgrade-003": 0.0, + "synthetic__performance-009": 0.0, + "synthetic__performance-024": 0.0, + "synthetic__security-008": 0.0, + "synthetic__performance-008": 1.0, + "synthetic__performance-019": 0.0, + "synthetic__upgrade-001": 0.0, + "synthetic__upgrade-009": 0.428571, + "synthetic__upgrade-007": 0.0, + "synthetic__security-014": 0.0, + "synthetic__performance-021": 0.285714, + "synthetic__style-017": 0.142857, + "synthetic__style-010": 0.5, + "synthetic__performance-012": 0.5, + "synthetic__style-002": 0.8, + "synthetic__privacy-010": 0.0, + "synthetic__performance-016": 0.0, + "synthetic__style-007": 0.0, + "synthetic__security-001": 0.4, + "synthetic__security-010": 0.666667, + "synthetic__upgrade-002": 0.0, + "synthetic__performance-022": 0.4, + "synthetic__performance-003": 0.25, + "synthetic__privacy-001": 0.666667, + "synthetic__upgrade-004": 0.5, + "synthetic__performance-011": 0.222222, + "synthetic__performance-004": 0.4, + "synthetic__security-002": 0.0, + "synthetic__privacy-013": 0.666667, + "synthetic__style-003": 0.285714, + "synthetic__upgrade-006": 0.133333, + "synthetic__privacy-011": 0.4 + } } ], "aggregate": [ @@ -3222,6 +3835,30 @@ "macro_f_beta_2": 0.2774, "macro_precision": 0.2836, "macro_recall": 0.2928 + }, + { + "model": "claude-opus-4-8", + "agent_name": "GitHub Copilot", + "category": "code-review", + "experiment": null, + "total": 81, + "num_runs": 5, + "average_duration": 60.67999999999999, + "benchmark_version": "0.6.0", + "f1": 0.314, + "f1_ci_low": 0.293, + "f1_ci_high": 0.329, + "f_beta_05": 0.2834, + "f_beta_2": 0.35140000000000005, + "precision": 0.266, + "recall": 0.38239999999999996, + "macro_f1": 0.379, + "macro_f1_ci_low": 0.345, + "macro_f1_ci_high": 0.411, + "macro_f_beta_05": 0.3498, + "macro_f_beta_2": 0.4354, + "macro_precision": 0.3412, + "macro_recall": 0.5126000000000001 } ] }