Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion docs/code-review.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Unlike the pass/fail categories, code review is scored with **Precision / Recall
<tr>
<th>Agent</th>
<th>Model</th>
<th>F1 (95% CI)</th>
<th>Micro F1 (95% CI)</th>
<th>Precision</th>
<th>Recall</th>
<th>Avg Time</th>
Expand Down Expand Up @@ -61,6 +61,54 @@ Unlike the pass/fail categories, code review is scored with **Precision / Recall
<p><em>No results available yet. Check back soon!</em></p>
{% endif %}

## Experiment Leaderboard

This leaderboard compares review-knowledge configurations for the same model (see the Baseline Leaderboard above for the plain agent):

- **Inline knowledge (pre-#8700)** — the review checklists BCApps shipped inline before adopting BCQuality, injected as custom instructions.
- **BCQuality (live skills)** — the agent dynamically consumes the live BCQuality skill tree.

{% assign experiment_rows = site.data.code-review.aggregate | where_exp: "agg", "agg.experiment" %}
{% if experiment_rows and experiment_rows.size > 0 %}
<table>
<thead>
<tr>
<th>Variant</th>
<th>Agent</th>
<th>Model</th>
<th>Micro F1 (95% CI)</th>
<th>Macro F1 (95% CI)</th>
<th>Precision</th>
<th>Recall</th>
<th>Avg Time</th>
<th>Ver</th>
</tr>
</thead>
<tbody>
{% assign experiment_results = experiment_rows | sort: "f1" | reverse %}
{% for agg in experiment_results %}
<tr>
<td>
{%- if agg.experiment.bcquality -%}BCQuality (live skills)
{%- elsif agg.experiment.custom_instructions -%}Inline knowledge (pre-#8700)
{%- else -%}Other{%- endif -%}
</td>
<td>{{ agg.agent_name }}</td>
<td>{{ agg.model }}</td>
<td>{{ agg.f1 | times: 100.0 | round: 1 }}%{% if agg.f1_ci_low %} ({{ agg.f1_ci_low | times: 100.0 | round: 1 }}-{{ agg.f1_ci_high | times: 100.0 | round: 1 }}%){% endif %}</td>
<td>{{ agg.macro_f1 | times: 100.0 | round: 1 }}%{% if agg.macro_f1_ci_low %} ({{ agg.macro_f1_ci_low | times: 100.0 | round: 1 }}-{{ agg.macro_f1_ci_high | times: 100.0 | round: 1 }}%){% endif %}</td>
<td>{{ agg.precision | times: 100.0 | round: 1 }}%</td>
<td>{{ agg.recall | times: 100.0 | round: 1 }}%</td>
<td>{{ agg.average_duration | round: 1 }}s</td>
<td><a href="https://github.com/microsoft/BC-Bench/releases/tag/v{{ agg.benchmark_version }}" target="_blank">{{ agg.benchmark_version }}</a></td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p><em>No experiment results available yet. Check back soon!</em></p>
{% endif %}

## How metrics are computed

- **Precision** — of the comments the agent generated, the fraction that matched an expected finding. Penalizes noisy reviews.
Expand Down
62 changes: 47 additions & 15 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@

from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.agent.shared.codereview_bcquality import parse_bcquality_config, prepare_bcquality_workspace
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.evaluate.codereview import REVIEW_OUTPUT_FILE
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
Expand Down Expand Up @@ -41,22 +43,49 @@ def run_copilot_agent(

logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}")

prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name)
lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.COPILOT, al_lsp=al_lsp, container_name=container_name)
instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
al_lsp_enabled=lsp_plugin_dir is not None,
custom_instructions=instructions_enabled,
skills_enabled=skills_enabled,
custom_agent=custom_agent,
)

logger.info(f"Executing Copilot CLI in directory: {repo_path}")

bcquality_config = parse_bcquality_config(copilot_config)
bcquality_live: bool = category == EvaluationCategory.CODE_REVIEW and bcquality_config is not None and bcquality_config.enabled

if bcquality_live:
assert bcquality_config is not None
# Live BCQuality consumption: clone+filter BCQuality and route the agent through skills/entry.md.
# The filtered clone (not the repo) becomes the Copilot CLI working directory; the repo under
# review is granted via --add-dir. No static instruction/skill/agent injection in this mode.
bootstrap_template: str = copilot_config["prompt"]["bcquality-bootstrap-template"]
bcquality_root, prompt = prepare_bcquality_workspace(bcquality_config, output_dir / "bcquality-clone", repo_path, REVIEW_OUTPUT_FILE, bootstrap_template)
work_dir: Path = bcquality_root
instructions_enabled: bool = False
skills_enabled: bool = False
custom_agent: str | None = None
# Copilot reads hooks from the CWD's .github/hooks, so install them into the clone to keep tool-usage metrics.
tool_log_path: Path = setup_hooks(bcquality_root, AgentType.COPILOT, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
al_lsp_enabled=lsp_plugin_dir is not None,
custom_instructions=False,
skills_enabled=False,
custom_agent=None,
bcquality=True,
)
else:
prompt = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
work_dir = repo_path
instructions_enabled = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
skills_enabled = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
custom_agent = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
tool_log_path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
al_lsp_enabled=lsp_plugin_dir is not None,
custom_instructions=instructions_enabled,
skills_enabled=skills_enabled,
custom_agent=custom_agent,
)

logger.info(f"Executing Copilot CLI in directory: {work_dir}")
logger.debug(f"Using prompt:\n{prompt}")

# Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell,
Expand All @@ -83,12 +112,15 @@ def run_copilot_agent(
cmd_args.append(f"--plugin-dir={lsp_plugin_dir}")
if custom_agent:
cmd_args.append(f"--agent={custom_agent}")
if bcquality_live:
# Grant the agent access to the repo under review (it lives outside the BCQuality CWD).
cmd_args.extend(["--add-dir", str(repo_path)])

logger.debug(f"Copilot command args: {cmd_args}")

result = subprocess.run(
cmd_args,
cwd=str(repo_path),
cwd=str(work_dir),
env={
**os.environ,
"GITHUB_COPILOT_PROMPT_MODE_REPO_HOOKS": "true",
Expand Down
Loading
Loading