-
Notifications
You must be signed in to change notification settings - Fork 0
128 lines (111 loc) · 4.39 KB
/
evals.yml
File metadata and controls
128 lines (111 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Manually dispatched workflow that runs the AgentV eval suites.
name: Run Evals

on:
  workflow_dispatch:
    # All inputs are optional; the "Resolve inputs" step applies fallbacks
    # (repository variables or built-in defaults) when they are left empty.
    inputs:
      suite_filter:
        description: 'Comma-separated glob patterns for eval files to run'
        required: false
        default: ''
      target:
        description: "Optional target override (leave empty to use each eval's own target)"
        required: false
        default: ''
      # Kept as a string so the value reaches the CLI exactly as typed.
      threshold:
        description: 'Minimum score threshold (0-1)'
        required: false
        default: '0.8'
jobs:
  # Single job: build the CLI, run the evals, publish the results.
  evals:
    name: Run AgentV Evals
    runs-on: ubuntu-latest
    # Explicit token scopes for this job.
    # NOTE(review): `checks: write` is presumably for the JUnit reporter step
    # and `models: read` for the GITHUB_TOKEN fallback used as
    # GH_MODELS_TOKEN; confirm before tightening.
    permissions:
      contents: read
      checks: write
      models: read
    steps:
# Check out the repository so the local action and sources are available.
- uses: actions/checkout@v4

# Node.js runtime; also provides npm for the global CLI installs below.
- uses: actions/setup-node@v4
  with:
    node-version: '22'

# Repository-local action (path starts with ./) that sets up Bun.
- uses: ./.github/actions/setup-bun

# Build the project; the eval step later runs apps/cli/dist/cli.js,
# which is presumably produced by this build.
- name: Build
  run: bun run build
# Global npm install of the Copilot CLI; a failure here fails the job.
- name: Install GitHub Copilot CLI
  run: npm install -g @github/copilot

# Best-effort install: on failure a message is printed and the job continues.
- name: Install Pi CLI
  run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)"

# Downloads the upstream uv installer script and pipes it straight into sh.
- name: Install uv (Python package manager)
  run: curl -LsSf https://astral.sh/uv/install.sh | sh
# Write a .env file in the workspace with the tokens and model/target
# names. GitHub substitutes every ${{ ... }} expression before the shell
# sees the script.
- name: Configure credentials
  run: |
    # The heredoc delimiter is quoted ('EOF') so bash copies the body
    # verbatim. With an unquoted delimiter, any `$`, backtick or backslash
    # inside a substituted secret/variable value would be expanded by the
    # shell, corrupting the value or executing a command substitution.
    cat > .env <<'EOF'
    GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
    GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
    COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
    AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }}
    GRADER_TARGET=${{ vars.GRADER_TARGET || 'openrouter' }}
    GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
    OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
    OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }}
    GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }}
    EOF
# Merge the dispatch inputs with repository-variable fallbacks and expose
# the results as step outputs: `patterns`, `target`, `threshold`.
- name: Resolve inputs
  id: filter
  # Values are handed to the script through environment variables instead of
  # being interpolated into the script text, so quotes or shell
  # metacharacters in an input cannot break or inject into the script.
  env:
    SUITE_FILTER: ${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS }}
    EXCLUDE_PATTERNS: ${{ vars.EVAL_EXCLUDE_PATTERNS }}
    TARGET_OVERRIDE: ${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}
    THRESHOLD: ${{ github.event.inputs.threshold || '0.8' }}
  run: |
    PATTERNS="$SUITE_FILTER"
    # Append the exclude patterns to the comma-separated list. The
    # ${PATTERNS:+...} form only inserts the separating comma when there is
    # already something in front of it; otherwise the list would start
    # with "," and later split into an empty first argument.
    if [ -n "$EXCLUDE_PATTERNS" ]; then
      PATTERNS="${PATTERNS:+$PATTERNS,}$EXCLUDE_PATTERNS"
    fi
    {
      echo "patterns=$PATTERNS"
      echo "target=$TARGET_OVERRIDE"
      echo "threshold=$THRESHOLD"
    } >> "$GITHUB_OUTPUT"
# Run the eval CLI and record its exit status as the `exit_code` output.
# This step itself does not fail when the CLI exits non-zero; the final
# "Fail if threshold not met" step turns a non-zero exit_code into a job
# failure after the summary/report/upload steps have run.
- name: Run AgentV evals
  id: run-evals
  env:
    COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }}
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Outputs of the "Resolve inputs" step, passed as environment variables
    # so their content is never parsed as part of the script.
    RESOLVED_PATTERNS: ${{ steps.filter.outputs.patterns }}
    RESOLVED_TARGET: ${{ steps.filter.outputs.target }}
    RESOLVED_THRESHOLD: ${{ steps.filter.outputs.threshold }}
  run: |
    mkdir -p .agentv/ci-results

    # Split comma-separated patterns into positional args
    IFS=',' read -ra PATTERNS <<< "$RESOLVED_PATTERNS"

    # Build optional --target flag (empty = use each eval's own target)
    TARGET_FLAG=()
    if [ -n "$RESOLVED_TARGET" ]; then
      TARGET_FLAG=(--target "$RESOLVED_TARGET")
    fi

    # The default Actions shell runs bash with -e, so a bare failing command
    # would abort the script before its status could be captured. The
    # `|| EXIT_CODE=$?` form keeps the script alive and records the status.
    EXIT_CODE=0
    bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
      "${TARGET_FLAG[@]}" \
      --workers 3 \
      --threshold "$RESOLVED_THRESHOLD" \
      --output .agentv/ci-results/junit.xml \
      --benchmark-json .agentv/ci-results/benchmark.json \
      --artifacts .agentv/ci-results/artifacts \
      || EXIT_CODE=$?
    echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
# Append the output of scripts/ci-summary.ts (given the results directory)
# to the job's step summary. Runs even when earlier steps failed.
- name: Post eval summary
  if: always()
  run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY"
# Publish the JUnit file written by the eval run. Reporting is best-effort:
# the step always runs, and neither a reporter error (continue-on-error)
# nor failing test cases (fail-on-error: false) fail the job here.
- name: Publish JUnit test results
  if: always()
  continue-on-error: true
  uses: dorny/test-reporter@v1
  with:
    name: AgentV Eval Results
    path: .agentv/ci-results/junit.xml
    reporter: java-junit
    fail-on-error: false
# Keep the results and logs directories as a run artifact for 30 days.
# The run id in the artifact name makes it unique per workflow run.
- name: Upload eval artifacts
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: eval-results-${{ github.run_id }}
    path: |
      .agentv/ci-results/
      .agentv/logs/
    retention-days: 30
# Final gate: fail the job unless the eval step recorded exit code 0.
# Runs with always() so it is evaluated after the reporting steps regardless
# of their outcome.
- name: Fail if threshold not met
  if: always()
  env:
    EVAL_EXIT_CODE: ${{ steps.run-evals.outputs.exit_code }}
    EVAL_THRESHOLD: ${{ steps.filter.outputs.threshold }}
  run: |
    # An empty value means the eval step never wrote its exit_code output
    # (it was skipped or aborted), which is a different failure from a low
    # score, so it gets its own message.
    if [ -z "$EVAL_EXIT_CODE" ]; then
      echo "::error::Eval step was skipped or failed before recording an exit code; see the 'Run AgentV evals' step log"
      exit 1
    fi
    if [ "$EVAL_EXIT_CODE" != "0" ]; then
      echo "::error::Eval score below threshold ($EVAL_THRESHOLD)"
      exit 1
    fi