# agentv/.github/workflows/evals.yml at main · EntityProcess/agentv · GitHub

# Display name of the workflow.
name: Run Evals

# Started manually through workflow_dispatch; every input is optional.
on:
  workflow_dispatch:
    inputs:
      # Selects which eval files run (comma-separated globs).
      suite_filter:
        description: 'Comma-separated glob patterns for eval files to run'
        default: ''
        required: false
      # Overrides the target; empty keeps each eval's own target.
      target:
        description: "Optional target override (leave empty to use each eval's own target)"
        default: ''
        required: false
      # Minimum passing score, supplied as a string.
      threshold:
        description: 'Minimum score threshold (0-1)'
        default: '0.8'
        required: false

jobs:
  # Builds the CLI, runs the evals, and publishes the results.
  evals:
    name: Run AgentV Evals
    runs-on: ubuntu-latest
    # Token scopes granted to this job's GITHUB_TOKEN.
    permissions:
      contents: read
      # NOTE(review): presumably required by the JUnit reporter step to create a check run — confirm.
      checks: write
      # NOTE(review): presumably for GitHub Models access when GITHUB_TOKEN is the fallback model token — confirm.
      models: read
    steps:
      # Check out the repository first: the local setup-bun action below is
      # resolved from the checked-out workspace.
      - uses: actions/checkout@v4
      # Node.js 22 provides the npm used by the later global CLI installs.
      - uses: actions/setup-node@v4
        with:
          node-version: 22
      # Repository-local action (path starts with ./) that sets up Bun.
      - uses: ./.github/actions/setup-bun

      # Runs the project's build script via Bun.
      # NOTE(review): presumably produces apps/cli/dist/cli.js used by the eval step — confirm.
      - name: Build
        run: bun run build

      # Globally installs the Copilot CLI; a failure here fails the job.
      # NOTE(review): presumably backs the default `copilot-cli` agent target — confirm.
      - name: Install GitHub Copilot CLI
        run: npm install -g @github/copilot

      # Best-effort install: the `|| echo` fallback keeps the step green when
      # the package cannot be installed.
      - name: Install Pi CLI
        run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)"

      # Downloads and runs the uv installer script.
      # `shell: bash` is set explicitly so the runner invokes bash with
      # `-eo pipefail`; with the implicit default (`bash -e`), a failed curl
      # would leave `sh` reading empty input and the step would pass without
      # uv being installed.
      - name: Install uv (Python package manager)
        shell: bash
        run: curl -LsSf https://astral.sh/uv/install.sh | sh

      # Writes a .env file in the workspace holding model credentials and the
      # target/model selections, each falling back to a default when the
      # repository variable is unset.
      # The heredoc delimiter is quoted ('EOF') so the shell copies the values
      # verbatim: with an unquoted delimiter, any `$`, backtick or backslash in
      # a secret or variable would be expanded by bash and corrupt the value.
      # NOTE(review): presumably the eval CLI loads this .env at runtime — confirm.
      - name: Configure credentials
        run: |
          cat > .env <<'EOF'
          GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
          GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
          COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
          AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }}
          GRADER_TARGET=${{ vars.GRADER_TARGET || 'openrouter' }}
          GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
          OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
          OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }}
          GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }}
          EOF

      # Resolves the effective eval patterns, target override and threshold
      # from the manual-dispatch inputs, falling back to repository variables,
      # and publishes them as step outputs `patterns`, `target`, `threshold`.
      #
      # Values are handed to the script through `env` rather than being
      # interpolated into the script text, so quotes or shell metacharacters in
      # an input cannot alter the commands that run.
      - name: Resolve inputs
        id: filter
        env:
          SUITE_FILTER: ${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS }}
          EXCLUDE_PATTERNS: ${{ vars.EVAL_EXCLUDE_PATTERNS }}
          TARGET_OVERRIDE: ${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}
          SCORE_THRESHOLD: ${{ github.event.inputs.threshold || '0.8' }}
        run: |
          PATTERNS="$SUITE_FILTER"
          if [ -n "$EXCLUDE_PATTERNS" ]; then
            # Only insert the comma when there is something to join onto, so an
            # empty include list does not yield a leading empty pattern.
            PATTERNS="${PATTERNS:+$PATTERNS,}$EXCLUDE_PATTERNS"
          fi
          {
            echo "patterns=$PATTERNS"
            echo "target=$TARGET_OVERRIDE"
            echo "threshold=$SCORE_THRESHOLD"
          } >> "$GITHUB_OUTPUT"

      # Runs the eval CLI and records its exit status as the `exit_code` step
      # output, leaving the pass/fail decision to the final step so the
      # reporting steps in between always see the result.
      #
      # The resolved inputs arrive through `env` (CI_EVAL_*) instead of being
      # interpolated into the script text, so their content cannot change the
      # commands that run.
      - name: Run AgentV evals
        id: run-evals
        env:
          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          CI_EVAL_PATTERNS: ${{ steps.filter.outputs.patterns }}
          CI_EVAL_TARGET: ${{ steps.filter.outputs.target }}
          CI_EVAL_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          mkdir -p .agentv/ci-results

          # Split comma-separated patterns into positional args
          IFS=',' read -ra PATTERNS <<< "$CI_EVAL_PATTERNS"

          # Build optional --target flag (empty = use each eval's own target)
          TARGET_FLAG=()
          if [ -n "$CI_EVAL_TARGET" ]; then
            TARGET_FLAG=(--target "$CI_EVAL_TARGET")
          fi

          # The runner starts bash with -e, so a bare failing command would
          # abort the script before the exit code is written. `|| EXIT_CODE=$?`
          # captures the status without tripping errexit.
          EXIT_CODE=0
          bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
            "${TARGET_FLAG[@]}" \
            --workers 3 \
            --threshold "$CI_EVAL_THRESHOLD" \
            --output .agentv/ci-results/junit.xml \
            --benchmark-json .agentv/ci-results/benchmark.json \
            --artifacts .agentv/ci-results/artifacts || EXIT_CODE=$?

          echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"

      # Appends the output of scripts/ci-summary.ts (given the results
      # directory) to the job summary. `always()` runs it even after an
      # earlier step has failed.
      - name: Post eval summary
        if: always()
        run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY"

      # Publishes the JUnit report through dorny/test-reporter. Reporting is
      # best-effort: it runs regardless of earlier failures, and neither a
      # reporter error nor failing tests in the report fail the job here.
      - name: Publish JUnit test results
        uses: dorny/test-reporter@v1
        if: always()
        continue-on-error: true
        with:
          name: AgentV Eval Results
          reporter: java-junit
          path: .agentv/ci-results/junit.xml
          fail-on-error: false

      # Archives the CI results and log directories under a per-run artifact
      # name, kept for 30 days; runs even when earlier steps failed.
      - name: Upload eval artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: eval-results-${{ github.run_id }}
          retention-days: 30
          path: |
            .agentv/ci-results/
            .agentv/logs/

      # Final gate: fails the job when the eval step reported a non-zero exit
      # code. Runs under `always()` so it executes after the reporting steps
      # regardless of earlier failures.
      #
      # An empty exit code means the eval step never recorded one (it was
      # skipped, or it failed before finishing); that case gets its own
      # message instead of being reported as a threshold miss.
      - name: Fail if threshold not met
        if: always()
        env:
          EVAL_EXIT_CODE: ${{ steps.run-evals.outputs.exit_code }}
          EVAL_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          if [ -z "$EVAL_EXIT_CODE" ]; then
            echo "::error::Eval step did not record an exit code (it was skipped, or it failed before finishing)"
            exit 1
          fi
          if [ "$EVAL_EXIT_CODE" != "0" ]; then
            echo "::error::Eval score below threshold ($EVAL_THRESHOLD)"
            exit 1
          fi