diff --git a/docs/code-review.md b/docs/code-review.md index 6f3ef7518..fe05e074a 100644 --- a/docs/code-review.md +++ b/docs/code-review.md @@ -33,7 +33,7 @@ Unlike the pass/fail categories, code review is scored with **Precision / Recall Agent Model - F1 (95% CI) + Micro F1 (95% CI) Precision Recall Avg Time @@ -61,6 +61,54 @@ Unlike the pass/fail categories, code review is scored with **Precision / Recall

No results available yet. Check back soon!

{% endif %} +## Experiment Leaderboard + +Compares review-knowledge configurations for the same model (see the Baseline Leaderboard above for the plain agent): + +- **Inline knowledge (pre-#8700)** — the review checklists BCApps shipped inline before adopting BCQuality, injected as custom instructions. +- **BCQuality (live skills)** — the agent dynamically consumes the live BCQuality skill tree. + +{% assign experiment_rows = site.data.code-review.aggregate | where_exp: "agg", "agg.experiment" %} +{% if experiment_rows and experiment_rows.size > 0 %} + + + + + + + + + + + + + + + + {% assign experiment_results = experiment_rows | sort: "f1" | reverse %} + {% for agg in experiment_results %} + + + + + + + + + + + + {% endfor %} + +
| Variant | Agent | Model | Micro F1 (95% CI) | Macro F1 (95% CI) | Precision | Recall | Avg Time | Ver |
+ {%- if agg.experiment.bcquality -%}BCQuality (live skills) + {%- elsif agg.experiment.custom_instructions -%}Inline knowledge (pre-#8700) + {%- else -%}Other{%- endif -%} + {{ agg.agent_name }}{{ agg.model }}{{ agg.f1 | times: 100.0 | round: 1 }}%{% if agg.f1_ci_low %} ({{ agg.f1_ci_low | times: 100.0 | round: 1 }}-{{ agg.f1_ci_high | times: 100.0 | round: 1 }}%){% endif %}{{ agg.macro_f1 | times: 100.0 | round: 1 }}%{% if agg.macro_f1_ci_low %} ({{ agg.macro_f1_ci_low | times: 100.0 | round: 1 }}-{{ agg.macro_f1_ci_high | times: 100.0 | round: 1 }}%){% endif %}{{ agg.precision | times: 100.0 | round: 1 }}%{{ agg.recall | times: 100.0 | round: 1 }}%{{ agg.average_duration | round: 1 }}s{{ agg.benchmark_version }}
+{% else %} +

No experiment results available yet. Check back soon!

+{% endif %} + ## How metrics are computed - **Precision** — of the comments the agent generated, the fraction that matched an expected finding. Penalizes noisy reviews. diff --git a/src/bcbench/types.py b/src/bcbench/types.py index b70821d05..f9d2733c1 100644 --- a/src/bcbench/types.py +++ b/src/bcbench/types.py @@ -95,13 +95,18 @@ class ExperimentConfiguration(BaseModel): # Custom agent name used in experiment (if any) custom_agent: str | None = None + # Live BCQuality consumption enabled (code-review category only) + bcquality: bool = False + def is_empty(self) -> bool: """Check if this configuration has all default/empty values. An empty configuration means no special experiment settings were used. This is useful for comparing with None (no experiment) vs default experiment. """ - return self.mcp_servers is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None + return ( + self.mcp_servers is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None and self.bcquality is False + ) class AgentType(StrEnum):