From 66265ff8ed6bfe80991bdc514cf1cee7f6a6b9e2 Mon Sep 17 00:00:00 2001 From: wenjiefan Date: Fri, 26 Jun 2026 13:30:28 +0200 Subject: [PATCH 1/4] ci: provision Copilot CLI for code-review judge in Claude workflow --- .github/workflows/claude-evaluation.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml index d91282bf2..695bb328b 100644 --- a/.github/workflows/claude-evaluation.yml +++ b/.github/workflows/claude-evaluation.yml @@ -77,6 +77,7 @@ jobs: permissions: contents: read id-token: write + copilot-requests: write # code-review semantic judge runs via Copilot CLI name: ${{ matrix.entry }} strategy: fail-fast: false @@ -116,13 +117,21 @@ jobs: - name: Install Claude Code run: npm install -g @anthropic-ai/claude-code@2.1.160 + # The code-review category scores findings with a semantic judge that runs on Copilot CLI, + # regardless of the agent under test. Provision it only when judging is needed. + - name: Install GitHub Copilot CLI (code-review judge) + if: ${{ inputs.category == 'code-review' }} + run: npm install -g @github/copilot@1.0.57 + - name: Run Claude Code for entry ${{ matrix.entry }} timeout-minutes: 120 shell: pwsh env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + COPILOT_GITHUB_TOKEN: ${{ github.token }} # used by the code-review semantic judge run: | Write-Output "::add-mask::$env:ANTHROPIC_API_KEY" + Write-Output "::add-mask::$env:COPILOT_GITHUB_TOKEN" uv run bcbench evaluate claude "${{ matrix.entry }}" ` --model "${{ inputs.model }}" ` From c2ffb754cbeb3bbf448d3bdd0e2611c35ec86a68 Mon Sep 17 00:00:00 2001 From: wenjiefan Date: Fri, 26 Jun 2026 14:46:10 +0200 Subject: [PATCH 2/4] ci: extract eval CLI install into shared composite action; install both CLIs in both workflows --- .github/actions/install-eval-clis/action.yml | 13 +++++++++++++ .github/workflows/claude-evaluation.yml | 10 ++-------- .github/workflows/copilot-evaluation.yml | 4 ++-- 3 files changed, 17 insertions(+), 10 deletions(-) create mode 100644 .github/actions/install-eval-clis/action.yml diff --git a/.github/actions/install-eval-clis/action.yml b/.github/actions/install-eval-clis/action.yml new file mode 100644 index 000000000..a1bae3a1f --- /dev/null +++ b/.github/actions/install-eval-clis/action.yml @@ -0,0 +1,13 @@ +name: Install evaluation CLIs +description: Install the Claude Code and GitHub Copilot CLIs used by the evaluation workflows + +runs: + using: composite + steps: + - name: Install Claude Code + run: npm install -g @anthropic-ai/claude-code@2.1.160 + shell: pwsh + + - name: Install GitHub Copilot CLI + run: npm install -g @github/copilot@1.0.57 + shell: pwsh diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml index 695bb328b..4fb021bd1 100644 --- a/.github/workflows/claude-evaluation.yml +++ b/.github/workflows/claude-evaluation.yml @@ -114,14 +114,8 @@ jobs: dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 18.0.37.11445-beta echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH - - name: Install Claude Code - run: npm install -g @anthropic-ai/claude-code@2.1.160 - - # The code-review category scores findings with a semantic judge that runs on Copilot CLI, - # regardless of the agent under test. Provision it only when judging is needed. - - name: Install GitHub Copilot CLI (code-review judge) - if: ${{ inputs.category == 'code-review' }} - run: npm install -g @github/copilot@1.0.57 + - name: Install evaluation CLIs + uses: ./.github/actions/install-eval-clis - name: Run Claude Code for entry ${{ matrix.entry }} timeout-minutes: 120 diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml index dd373536d..f0f11d787 100644 --- a/.github/workflows/copilot-evaluation.yml +++ b/.github/workflows/copilot-evaluation.yml @@ -121,8 +121,8 @@ jobs: dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 18.0.37.11445-beta echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH - - name: Install GitHub Copilot CLI - run: npm install -g @github/copilot@1.0.57 + - name: Install evaluation CLIs + uses: ./.github/actions/install-eval-clis - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }} timeout-minutes: 120 From 1886bbbf1ba68e01ed911e9358c85f49cab1979a Mon Sep 17 00:00:00 2001 From: Wenjie Fan <31087545+gggdttt@users.noreply.github.com> Date: Fri, 26 Jun 2026 14:55:34 +0200 Subject: [PATCH 3/4] Update .github/actions/install-eval-clis/action.yml Co-authored-by: Sun Haoran --- .github/actions/install-eval-clis/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-eval-clis/action.yml b/.github/actions/install-eval-clis/action.yml index a1bae3a1f..e9bc0c3bb 100644 --- a/.github/actions/install-eval-clis/action.yml +++ b/.github/actions/install-eval-clis/action.yml @@ -1,4 +1,4 @@ -name: Install evaluation CLIs +name: Install Agent Harnesses description: Install the Claude Code and GitHub Copilot CLIs used by the evaluation workflows runs: From efe69b6fbde45bb8f60e613122af9d1b96595d97 Mon Sep 17 00:00:00 2001 From: Wenjie Fan <31087545+gggdttt@users.noreply.github.com> Date: Fri, 26 Jun 2026 14:55:40 +0200 Subject: [PATCH 4/4] Update .github/workflows/claude-evaluation.yml Co-authored-by: Sun Haoran --- .github/workflows/claude-evaluation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml index 4fb021bd1..5952c6026 100644 --- a/.github/workflows/claude-evaluation.yml +++ b/.github/workflows/claude-evaluation.yml @@ -114,7 +114,7 @@ jobs: dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 18.0.37.11445-beta echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH - - name: Install evaluation CLIs + - name: Install Agent Harnesses uses: ./.github/actions/install-eval-clis - name: Run Claude Code for entry ${{ matrix.entry }}