microsoft · gggdttt · Jun 27, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 27, 2026
diff --git a/docs/code-review.md b/docs/code-review.md
@@ -33,7 +33,7 @@ Unlike the pass/fail categories, code review is scored with **Precision / Recall
     <tr>
       <th>Agent</th>
       <th>Model</th>
-      <th>F1 (95% CI)</th>
+      <th>Micro F1 (95% CI)</th>
       <th>Precision</th>
       <th>Recall</th>
       <th>Avg Time</th>
@@ -61,6 +61,54 @@ Unlike the pass/fail categories, code review is scored with **Precision / Recall
 <p><em>No results available yet. Check back soon!</em></p>
 {% endif %}
 
+## Experiment Leaderboard
+
+This leaderboard compares the following review-knowledge configurations for the same model (see the Baseline Leaderboard above for the plain agent):
+
+- **Inline knowledge (pre-#8700)** — the review checklists BCApps shipped inline before adopting BCQuality, injected as custom instructions.
+- **BCQuality (live skills)** — the agent dynamically consumes the live BCQuality skill tree.
+
+{% assign experiment_rows = site.data.code-review.aggregate | where_exp: "agg", "agg.experiment" %}
+{% if experiment_rows and experiment_rows.size > 0 %}
+<table>
+  <thead>
+    <tr>
+      <th>Variant</th>
+      <th>Agent</th>
+      <th>Model</th>
+      <th>Micro F1 (95% CI)</th>
+      <th>Macro F1 (95% CI)</th>
+      <th>Precision</th>
+      <th>Recall</th>
+      <th>Avg Time</th>
+      <th>Ver</th>
+    </tr>
+  </thead>
+  <tbody>
+    {% assign experiment_results = experiment_rows | sort: "f1" | reverse %}
+    {% for agg in experiment_results %}
+    <tr>
+      <td>
+        {%- if agg.experiment.bcquality -%}BCQuality (live skills)
+        {%- elsif agg.experiment.custom_instructions -%}Inline knowledge (pre-#8700)
+        {%- else -%}Other{%- endif -%}
+      </td>
+      <td>{{ agg.agent_name }}</td>
+      <td>{{ agg.model }}</td>
+      <td>{{ agg.f1 | times: 100.0 | round: 1 }}%{% if agg.f1_ci_low %} ({{ agg.f1_ci_low | times: 100.0 | round: 1 }}-{{ agg.f1_ci_high | times: 100.0 | round: 1 }}%){% endif %}</td>
+      <td>{{ agg.macro_f1 | times: 100.0 | round: 1 }}%{% if agg.macro_f1_ci_low %} ({{ agg.macro_f1_ci_low | times: 100.0 | round: 1 }}-{{ agg.macro_f1_ci_high | times: 100.0 | round: 1 }}%){% endif %}</td>
+      <td>{{ agg.precision | times: 100.0 | round: 1 }}%</td>
+      <td>{{ agg.recall | times: 100.0 | round: 1 }}%</td>
+      <td>{{ agg.average_duration | round: 1 }}s</td>
+      <td><a href="https://github.com/microsoft/BC-Bench/releases/tag/v{{ agg.benchmark_version }}" target="_blank">{{ agg.benchmark_version }}</a></td>
+    </tr>
+    {% endfor %}
+  </tbody>
+</table>
+{% else %}
+<p><em>No experiment results available yet. Check back soon!</em></p>
+{% endif %}
+
 ## How metrics are computed
 
 - **Precision** — of the comments the agent generated, the fraction that matched an expected finding. Penalizes noisy reviews.

diff --git a/src/bcbench/types.py b/src/bcbench/types.py
@@ -95,13 +95,18 @@ class ExperimentConfiguration(BaseModel):
     # Custom agent name used in experiment (if any)
     custom_agent: str | None = None
 
+    # Whether the agent consumes the live BCQuality skill tree (code-review category only)
+    bcquality: bool = False
+
     def is_empty(self) -> bool:
         """Check if this configuration has all default/empty values.
 
         An empty configuration means no special experiment settings were used.
         This is useful for comparing with None (no experiment) vs default experiment.
         """
-        return self.mcp_servers is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None
+        return (
+            self.mcp_servers is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None and self.bcquality is False
+        )
 
 
 class AgentType(StrEnum):