microsoft · WaelAbuSeada · Apr 8, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
@@ -23,6 +23,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false

diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
@@ -30,6 +30,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
@@ -153,7 +154,9 @@ jobs:
         if: always()
         with:
           name: evaluation-results-${{ github.run_id }}-${{ matrix.entry }}
-          path: ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
+          path: |
+            ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
+            ${{ env.EVALUATION_RESULTS_DIR }}/**/*.log
           retention-days: ${{ inputs.test-run && 1 || 30 }}
 
   summarize-results:

diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml
@@ -108,7 +108,8 @@ jobs:
             --use-capi ${{ !inputs.mock && '--storage braintrust --storage kusto' || '' }}
 
       - name: Update leaderboard in a new branch
-        if: ${{ !inputs.mock && !inputs.skip-leaderboard }}
+        # Skip the leaderboard update for code-review runs: leaderboard support for that category is still WIP
+        if: ${{ !inputs.mock && !inputs.skip-leaderboard && inputs.category != 'code-review' }}
         run: |
           git fetch origin main
 

diff --git a/dataset/codereview.jsonl b/dataset/codereview.jsonl
diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json
@@ -0,0 +1,4 @@
+{
+    "runs": [],
+    "aggregate": []
+}
diff --git a/evaluator/scores.py b/evaluator/scores.py
@@ -19,3 +19,23 @@ def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
 class PostPatchPassedRate:
     def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
         return metadata.get("post_patch_passed", False)
+
+
+class PrecisionScore:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        return float(metadata.get("precision", 0.0))
+
+
+class RecallScore:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        return float(metadata.get("recall", 0.0))
+
+
+class F1Score:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        return float(metadata.get("f1", 0.0))
+
+
+class ValidReviewOutput:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
+        return bool(metadata.get("valid_review_output", False))
diff --git a/scripts/BCBenchUtils.psm1 b/scripts/BCBenchUtils.psm1
@@ -490,13 +490,14 @@ function Get-BCBenchDatasetPath {
     param(
         [Parameter(Mandatory = $true)]
         # Category validation lives only here: every caller resolves the dataset path through this function, so there's no need to duplicate ValidateSet on each caller.
-        [ValidateSet("bug-fix", "test-generation")]
+        [ValidateSet("bug-fix", "test-generation", "code-review")]
         [string] $Category
     )
 
     switch ($Category) {
         "bug-fix" { $DatasetName = "bcbench.jsonl" }
         "test-generation" { $DatasetName = "bcbench.jsonl" }
+        "code-review" { $DatasetName = "codereview.jsonl" }
     }
 
     [string] $projectRoot = Split-Path $PSScriptRoot -Parent

diff --git a/src/bcbench/agent/copilot/metrics.py b/src/bcbench/agent/copilot/metrics.py
@@ -34,7 +34,12 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
         output_lines: Lines from Copilot CLI stderr output
         session_log_path: Optional path to session log file for tool usage parsing
 
-    Expected output format (new, v1.0.2+):
+    Expected output format (newest, v1.0.61+):
+        Changes    +23 -0
+        AI Credits 58.4 (1m 14s)
+        Tokens     ↑ 413.9k (368.1k cached) • ↓ 4.5k (500 reasoning)
+
+    Previous output format (v1.0.2..v1.0.60):
         Changes   +17 -0
         Requests  0.33 Premium (1m 45s)
         Tokens    ↑ 317.5k • ↓ 4.3k • 255.0k (cached)
@@ -83,26 +88,41 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
             seconds = float(duration_match.group(2))
             execution_time = minutes * 60 + seconds
 
-        # New format: "Requests  0.33 Premium (1m 45s)" — extract session time from parenthesized duration
+        # Previous format (v1.0.2..v1.0.60): "Requests  0.33 Premium (1m 45s)" — extract session time from the parenthesized duration
         if execution_time is None:
             requests_match = re.search(r"Requests\s+[\d.]+\s+Premium\s+\((?:(\d+)m\s*)?(\d+(?:\.\d+)?)s\)", output_text)
             if requests_match:
                 minutes = int(requests_match.group(1)) if requests_match.group(1) else 0
                 seconds = float(requests_match.group(2))
                 execution_time = minutes * 60 + seconds
 
+        # Newest format (v1.0.61+): "AI Credits 58.4 (1m 14s)" — "Requests N Premium" was renamed to "AI Credits N"
+        if execution_time is None:
+            credits_match = re.search(r"AI Credits\s+[\d.]+\s+\((?:(\d+)m\s*)?(\d+(?:\.\d+)?)s\)", output_text)
+            if credits_match:
+                minutes = int(credits_match.group(1)) if credits_match.group(1) else 0
+                seconds = float(credits_match.group(2))
+                execution_time = minutes * 60 + seconds
+
         # Token usage — legacy format: "1.3m in, 11.6k out"
         usage_match = re.search(r"(\d+(?:\.\d+)?[km]?)\s+in,\s*(\d+(?:\.\d+)?[km]?)\s+out", output_text)
         if usage_match:
             prompt_tokens = _parse_token_count(usage_match.group(1))
             completion_tokens = _parse_token_count(usage_match.group(2))
 
-        # New format: "Tokens    ↑ 317.5k • ↓ 4.3k • 255.0k (cached)"
+        # Previous format (v1.0.2..v1.0.60): "Tokens    ↑ 317.5k • ↓ 4.3k • 255.0k (cached)"
+        # Newest format (v1.0.61+): "Tokens     ↑ 413.9k (368.1k cached) • ↓ 4.5k (500 reasoning)"
+        # Use separate ↑ / ↓ lookups to tolerate inline "(N cached)" / "(N reasoning)" annotations
+        # between the two values.
         if prompt_tokens is None:
-            tokens_match = re.search(r"Tokens\s+[^\d]*(\d+(?:\.\d+)?[km]?)\s*[•·]\s*[^\d]*(\d+(?:\.\d+)?[km]?)", output_text)
-            if tokens_match:
-                prompt_tokens = _parse_token_count(tokens_match.group(1))
-                completion_tokens = _parse_token_count(tokens_match.group(2))
+            tokens_line_match = re.search(r"Tokens\s+([^\n]+)", output_text)
+            if tokens_line_match:
+                tokens_line = tokens_line_match.group(1)
+                up_match = re.search(r"\u2191\s*(\d+(?:\.\d+)?[km]?)", tokens_line)
+                down_match = re.search(r"\u2193\s*(\d+(?:\.\d+)?[km]?)", tokens_line)
+                if up_match and down_match:
+                    prompt_tokens = _parse_token_count(up_match.group(1))
+                    completion_tokens = _parse_token_count(down_match.group(1))
 
         if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None:
             return AgentMetrics(

diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
@@ -50,6 +50,21 @@ prompt:
     {{task}}
     {% endif %}
 
+  code-review-template: |
+    /al-code-review
+
+    Review ONLY the current working-tree AL file changes for this evaluation entry.
+    Use the working tree diff only (git diff HEAD), and focus on changed *.al files.
+    Do NOT review committed history or the HEAD commit, and do NOT compare commits (for example, do NOT use HEAD~1..HEAD or origin/main comparisons).
+
+    Save findings to a file named "review.json" in the repository root.
+    The file must contain valid JSON with a top-level object named findings.
+    Each finding must include: filePath, lineNumber, severity, issue, recommendation, domain, suggestedCode.
+    Map the skill's findings into this schema as described in the skill's output-mapping section
+    (blocker->critical, major->high, minor->medium, info->low; from-sub-skill->domain).
+    Allowed severity values are: critical, high, medium, low.
+    If there are no findings, write an empty findings list.
+
 # controls:
 # 1. whether to copy custom instructions from `src/bcbench/agent/shared/instructions/<sanitized-repo>/`
 #    - Copilot: copies to repo/.github/ and renames AGENTS.md to copilot-instructions.md
@@ -66,7 +81,7 @@ instructions:
 #    - Copilot: copies to repo/.github/skills/
 #    - Claude: copies to repo/.claude/skills/
 skills:
-  enabled: false
+  enabled: true
 
 # controls:
 # 1. whether to copy custom agents from `src/bcbench/agent/shared/instructions/<sanitized-repo>/agents/`

diff --git a/src/bcbench/agent/shared/hooks/log_tool_usage.py b/src/bcbench/agent/shared/hooks/log_tool_usage.py
@@ -0,0 +1,51 @@
+"""Copilot/Claude PreToolUse hook: log tool invocations to a JSONL file.
+
+Reads the hook payload from stdin and appends one JSON line per call to the
+path in BCBENCH_TOOL_LOG. Used by both Copilot CLI (Linux runners) and Claude
+hooks via the `bash` field of the hook command spec; the legacy .ps1 in this
+directory mirrors the same behavior for the Windows `powershell` field.
+"""
+
+import contextlib
+import json
+import os
+import sys
+
+
+def _extract_tool_name(payload: dict) -> str | None:
+    name = payload.get("tool_name") or payload.get("toolName")
+    if name != "lsp":
+        return name
+
+    args = payload.get("toolArgs") or payload.get("tool_input")
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except json.JSONDecodeError:
+            args = None
+    if isinstance(args, dict) and (op := args.get("operation")):
+        return f"lsp:{op}"
+    return name
+
+
+def main() -> None:
+    try:
+        payload = json.loads(sys.stdin.read() or "{}")
+    except json.JSONDecodeError:
+        return
+
+    name = _extract_tool_name(payload)
+    log_path = os.environ.get("BCBENCH_TOOL_LOG")
+    if not name or not log_path:
+        return
+
+    entry = {"tool_name": name, "timestamp": payload.get("timestamp", "")}
+    with open(log_path, "a", encoding="utf-8") as f:
+        f.write(json.dumps(entry) + "\n")
+
+
+if __name__ == "__main__":
+    with contextlib.suppress(Exception):
+        # Never block tool execution — silently fail.
+        main()
+    sys.exit(0)
diff --git a/...bench/agent/shared/instructions/microsoft-BCApps/skills/al-code-review/SKILL.md b/...bench/agent/shared/instructions/microsoft-BCApps/skills/al-code-review/SKILL.md
@@ -0,0 +1,131 @@
+---
+name: al-code-review
+description: 'Review AL code for Dynamics 365 Business Central by composing specialized review sub-skills (performance, security, privacy, upgrade, style), each backed by curated BCQuality knowledge files. Use when reviewing AL code changes or pull requests and producing structured findings.'
+allowed-tools: Read, Glob, Grep, LSP
+argument-hint: 'leave empty to run the full composed review across all domains'
+---
+
+# AL Code Review (composed super-skill / sub-skill review)
+
+Reviews AL code for Dynamics 365 Business Central using a **composition** pattern: a single
+super-skill invokes five domain leaf sub-skills one at a time, each evaluating the diff against
+its own curated **knowledge files**, then performs a cross-cutting self-review pass. The result
+is mapped into this repository's `review.json` schema.
+
+## When to Use
+
+- Reviewing AL code changes or pull requests
+- User asks for "code review", "review this AL code", or domain-specific analysis
+
+## Vendored layout — all paths are relative to THIS skill folder
+
+This skill is self-contained. The evaluator copies it to `.github/skills/al-code-review/` (Copilot) or `.claude/skills/al-code-review/` (Claude).
+Every path referenced by the vendored framework files below resolves **relative to this skill
+folder** (the directory containing this `SKILL.md`), not the repository root:
+
+```
+.github/skills/al-code-review/
+  SKILL.md                                  <- this entry point (you are here)
+  skills/
+    read.md                                 <- READ contract: how to read a knowledge file
+    do.md                                   <- DO contract: the action-skill template + output schema
+  microsoft/
+    skills/review/
+      al-code-review.md                     <- the super-skill (composition orchestrator)
+      al-performance-review.md              <- leaf sub-skill
+      al-privacy-review.md                  <- leaf sub-skill
+      al-security-review.md                 <- leaf sub-skill
+      al-style-review.md                    <- leaf sub-skill
+      al-upgrade-review.md                  <- leaf sub-skill
+    knowledge/
+      performance/  privacy/  security/  style/  upgrade/   <- knowledge files (*.md) + samples (*.al)
+  knowledge-index.json                      <- discovery metadata for the knowledge corpus
+```
+
+This `SKILL.md` is the entry point. The BCQuality framework's own `entry.md` routing/dispatch
+step is **not** vendored — its job is fulfilled here: you always dispatch the single super-skill
+`microsoft/skills/review/al-code-review.md`. Ignore any reference to `entry.md` inside the
+vendored contracts; wherever a contract says "the knowledge index BCQuality builds at the root of
+the knowledge checkout", read the already-built `knowledge-index.json` in this skill folder.
+
+## Review Process
+
+1. **Read the contracts first.** Read `skills/read.md` (knowledge-file schema + frontmatter
+   matching) and `skills/do.md` (action-skill template, severity taxonomy, output contract,
+   agent-finding precision bar).
+2. **Invoke the super-skill.** Read `microsoft/skills/review/al-code-review.md` and execute it.
+   It composes the five leaf sub-skills listed in its frontmatter `sub-skills`.
+3. **Run each leaf one at a time (mandatory execution discipline).** For each of
+   `al-performance-review`, `al-security-review`, `al-privacy-review`, `al-upgrade-review`,
+   `al-style-review`: read the leaf file, read `knowledge-index.json`, select the candidate
+   articles for that leaf's `domain`, open only the worklisted knowledge files in full, and
+   evaluate the diff against their `## Best Practice` / `## Anti Pattern` sections. Do NOT
+   collapse multiple leaves into one shared scan — each leaf re-walks the diff independently.
+4. **Cross-cutting self-review pass.** After all five leaves finish, perform the super-skill's
+   own self-review for defects that span domain boundaries. Hold agent findings to the precision
+   bar in `skills/do.md` (concrete, demonstrable, material; steelman first; when in doubt, omit).
+5. **Map and write output.** Convert the rolled-up findings into this repository's `review.json`
+   schema (below) and save to a file named `review.json` in the repository root.
+
+## Scope of the diff
+
+Review ONLY the current working-tree AL file changes for this evaluation entry. Do NOT compare
+commits (do NOT use `HEAD~1..HEAD` or `origin/main`). Use the working-tree diff only (`git diff HEAD`)
+and focus on changed `*.al` files.
+
+## Strict domain discipline
+
+Each leaf sub-skill owns exactly one domain and emits findings only within that domain. When a
+leaf is active, judge every candidate by its **root cause**, not by surrounding names: a
+non-translatable string in a method called `GenerateComplianceReport` is a `style` issue, not
+`privacy`. If a candidate's root cause is outside the active leaf's domain, the active leaf stays
+silent — the owning leaf (or the cross-cutting pass) will surface it. When in doubt, drop it.
+
+## Output mapping — BCQuality findings-report -> review.json
+
+The composed run produces a BCQuality findings-report (see `skills/do.md`). Map it into the
+`review.json` schema this repository expects. The output file MUST contain valid JSON: a
+top-level object whose `findings` key holds a list; each finding is an object with exactly these fields:
+
+| review.json field | Source in the BCQuality finding | Notes |
+|-------------------|---------------------------------|-------|
+| `filePath`        | `location.file`                 | Repo-relative path of the changed `*.al` file. |
+| `lineNumber`      | `location.line`                 | 1-based line in the changed file. |
+| `severity`        | `severity`                      | Map: `blocker`->`critical`, `major`->`high`, `minor`->`medium`, `info`->`low`. |
+| `issue`           | `message`                       | Describe the concern. |
+| `recommendation`  | `message` / knowledge guidance  | The concrete fix; draw from the knowledge file's `## Best Practice` when present. |
+| `domain`          | `from-sub-skill`                | Map leaf id to domain (below). |
+| `suggestedCode`   | `suggested-code`                | Literal replacement for the located lines; empty string if none. |
+
+Domain mapping for `from-sub-skill`:
+
+- `al-performance-review` -> `performance`
+- `al-security-review` -> `security`
+- `al-privacy-review` -> `privacy`
+- `al-upgrade-review` -> `upgrade`
+- `al-style-review` -> `style`
+- `agent` (cross-cutting self-review finding) or a leaf's own agent finding -> map to the single
+  closest of the five domains above by the finding's root cause.
+
+Allowed `severity` values in `review.json` are exactly: `critical`, `high`, `medium`, `low`.
+Drop the BCQuality-only fields (`id`, `references`, `confidence`, `from-sub-skill`, `sub-results`,
+etc.) — they are not part of `review.json`. If there are no findings, write an empty `findings`
+list.
+
+Example `review.json`:
+
+```json
+{
+  "findings": [
+    {
+      "filePath": "src/Sales/PostingRoutines.Codeunit.al",
+      "lineNumber": 140,
+      "severity": "high",
+      "issue": "FindSet is called without a prior SetRange/SetFilter, forcing a full-table scan.",
+      "recommendation": "Apply SetRange/SetFilter to narrow the record set before FindSet, per the filter-before-find guidance.",
+      "domain": "performance",
+      "suggestedCode": ""
+    }
+  ]
+}
+```