From 66265ff8ed6bfe80991bdc514cf1cee7f6a6b9e2 Mon Sep 17 00:00:00 2001
From: wenjiefan <wenjiefan@microsoft.com>
Date: Fri, 26 Jun 2026 13:30:28 +0200
Subject: [PATCH 1/4] ci: provision Copilot CLI for code-review judge in Claude
 workflow

---
 .github/workflows/claude-evaluation.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
index d91282bf2..695bb328b 100644
--- a/.github/workflows/claude-evaluation.yml
+++ b/.github/workflows/claude-evaluation.yml
@@ -77,6 +77,7 @@ jobs:
     permissions:
       contents: read
       id-token: write
+      copilot-requests: write  # code-review semantic judge runs via Copilot CLI
     name: ${{ matrix.entry }}
     strategy:
       fail-fast: false
@@ -116,13 +117,21 @@ jobs:
       - name: Install Claude Code
         run: npm install -g @anthropic-ai/claude-code@2.1.160
 
+      # The code-review category scores findings with a semantic judge that runs on Copilot CLI,
+      # regardless of the agent under test. Provision it only when judging is needed.
+      - name: Install GitHub Copilot CLI (code-review judge)
+        if: ${{ inputs.category == 'code-review' }}
+        run: npm install -g @github/copilot@1.0.57
+
       - name: Run Claude Code for entry ${{ matrix.entry }}
         timeout-minutes: 120
         shell: pwsh
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          COPILOT_GITHUB_TOKEN: ${{ github.token }}  # used by the code-review semantic judge
         run: |
           Write-Output "::add-mask::$env:ANTHROPIC_API_KEY"
+          Write-Output "::add-mask::$env:COPILOT_GITHUB_TOKEN"
 
           uv run bcbench evaluate claude "${{ matrix.entry }}" `
             --model "${{ inputs.model }}" `

From c2ffb754cbeb3bbf448d3bdd0e2611c35ec86a68 Mon Sep 17 00:00:00 2001
From: wenjiefan <wenjiefan@microsoft.com>
Date: Fri, 26 Jun 2026 14:46:10 +0200
Subject: [PATCH 2/4] ci: extract eval CLI install into shared composite
 action; install both CLIs in both workflows

---
 .github/actions/install-eval-clis/action.yml | 13 +++++++++++++
 .github/workflows/claude-evaluation.yml      | 10 ++--------
 .github/workflows/copilot-evaluation.yml     |  4 ++--
 3 files changed, 17 insertions(+), 10 deletions(-)
 create mode 100644 .github/actions/install-eval-clis/action.yml

diff --git a/.github/actions/install-eval-clis/action.yml b/.github/actions/install-eval-clis/action.yml
new file mode 100644
index 000000000..a1bae3a1f
--- /dev/null
+++ b/.github/actions/install-eval-clis/action.yml
@@ -0,0 +1,13 @@
+name: Install evaluation CLIs
+description: Install the Claude Code and GitHub Copilot CLIs used by the evaluation workflows
+
+runs:
+  using: composite  # composite action: the steps below run inside the calling job
+  steps:
+    - name: Install Claude Code
+      run: npm install -g @anthropic-ai/claude-code@2.1.160  # global install, pinned version
+      shell: pwsh  # run steps in a composite action must declare their shell
+
+    - name: Install GitHub Copilot CLI
+      run: npm install -g @github/copilot@1.0.57  # global install, pinned version
+      shell: pwsh  # run steps in a composite action must declare their shell
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
index 695bb328b..4fb021bd1 100644
--- a/.github/workflows/claude-evaluation.yml
+++ b/.github/workflows/claude-evaluation.yml
@@ -114,14 +114,8 @@ jobs:
           dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 18.0.37.11445-beta
           echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH
 
-      - name: Install Claude Code
-        run: npm install -g @anthropic-ai/claude-code@2.1.160
-
-      # The code-review category scores findings with a semantic judge that runs on Copilot CLI,
-      # regardless of the agent under test. Provision it only when judging is needed.
-      - name: Install GitHub Copilot CLI (code-review judge)
-        if: ${{ inputs.category == 'code-review' }}
-        run: npm install -g @github/copilot@1.0.57
+      - name: Install evaluation CLIs
+        uses: ./.github/actions/install-eval-clis
 
       - name: Run Claude Code for entry ${{ matrix.entry }}
         timeout-minutes: 120
diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
index dd373536d..f0f11d787 100644
--- a/.github/workflows/copilot-evaluation.yml
+++ b/.github/workflows/copilot-evaluation.yml
@@ -121,8 +121,8 @@ jobs:
           dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 18.0.37.11445-beta
           echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH
 
-      - name: Install GitHub Copilot CLI
-        run: npm install -g @github/copilot@1.0.57
+      - name: Install Agent Harnesses
+        uses: ./.github/actions/install-eval-clis  # shared composite action: installs Claude Code and Copilot CLI
 
       - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }}
         timeout-minutes: 120

From 1886bbbf1ba68e01ed911e9358c85f49cab1979a Mon Sep 17 00:00:00 2001
From: Wenjie Fan <31087545+gggdttt@users.noreply.github.com>
Date: Fri, 26 Jun 2026 14:55:34 +0200
Subject: [PATCH 3/4] ci: rename eval CLI action to "Install Agent Harnesses"

Co-authored-by: Sun Haoran <haoransun@microsoft.com>
---
 .github/actions/install-eval-clis/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/install-eval-clis/action.yml b/.github/actions/install-eval-clis/action.yml
index a1bae3a1f..e9bc0c3bb 100644
--- a/.github/actions/install-eval-clis/action.yml
+++ b/.github/actions/install-eval-clis/action.yml
@@ -1,4 +1,4 @@
-name: Install evaluation CLIs
+name: Install Agent Harnesses
 description: Install the Claude Code and GitHub Copilot CLIs used by the evaluation workflows
 
 runs:

From efe69b6fbde45bb8f60e613122af9d1b96595d97 Mon Sep 17 00:00:00 2001
From: Wenjie Fan <31087545+gggdttt@users.noreply.github.com>
Date: Fri, 26 Jun 2026 14:55:40 +0200
Subject: [PATCH 4/4] ci: rename Claude install step to "Install Agent Harnesses"

Co-authored-by: Sun Haoran <haoransun@microsoft.com>
---
 .github/workflows/claude-evaluation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
index 4fb021bd1..5952c6026 100644
--- a/.github/workflows/claude-evaluation.yml
+++ b/.github/workflows/claude-evaluation.yml
@@ -114,7 +114,7 @@ jobs:
           dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 18.0.37.11445-beta
           echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH
 
-      - name: Install evaluation CLIs
+      - name: Install Agent Harnesses
         uses: ./.github/actions/install-eval-clis
 
       - name: Run Claude Code for entry ${{ matrix.entry }}