Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1026e00
feat(evals): set default targets so all evals work out of the box
christso Apr 1, 2026
ebe1688
feat(evals): set default targets so all evals work out of the box
christso Apr 1, 2026
f74fb09
feat(evals): make default target env-var-driven for out-of-box evals
christso Apr 1, 2026
d2102dc
fix(ci): use explicit include patterns instead of negated globs
christso Apr 1, 2026
37a526c
feat(cli): support negation patterns (!glob) in eval path resolution
christso Apr 1, 2026
71d77a5
fix(ci): remove --targets override so per-example targets auto-discover
christso Apr 1, 2026
df3a765
fix: remove deprecated workspace_template from mock target configs
christso Apr 1, 2026
1191250
fix(ci): add Gemini credentials to workflow .env
christso Apr 1, 2026
03f5503
feat(evals): add llm target and classify all evals as llm or agent
christso Apr 1, 2026
b2c6a78
fix(evals): use default (copilot) instead of pi-cli for agent evals
christso Apr 1, 2026
0b04cf9
chore(ci): increase eval workers from 1 to 3
christso Apr 1, 2026
5c53635
fix(ci): exclude evals with local script providers from CI
christso Apr 1, 2026
f3870d6
fix(ci): add missing echo provider and install uv for local script evals
christso Apr 1, 2026
d081bd6
fix(evals): make LLM eval assertions pass with generic models
christso Apr 1, 2026
f8d8e94
fix(evals): switch llm and grader targets to OpenRouter
christso Apr 1, 2026
2a9f1c3
fix(evals): switch per-example grader targets from azure to root grader
christso Apr 1, 2026
2185c65
feat(core): add target alias support for single-env-var provider swit…
christso Apr 1, 2026
6438c23
feat(core): add use_target for target delegation
christso Apr 1, 2026
6936380
refactor(targets): use use_target for llm and grader targets
christso Apr 1, 2026
a076d4e
refactor(core): make provider optional when use_target is set
christso Apr 1, 2026
fddd943
fix(core): allow provider to be omitted when use_target is set
christso Apr 1, 2026
3c39f70
fix(core): allow use_target in targets-file.ts parser
christso Apr 1, 2026
7650b51
fix(ci): exclude copilot-log-eval from CI
christso Apr 1, 2026
3441f91
fix(cli): catch before_all failures per eval file instead of aborting
christso Apr 1, 2026
0dd936a
fix(core): resolve use_target chains in orchestrator for grader targets
christso Apr 1, 2026
50eef93
fix(evals): restore workspace.template for mock agent evals
christso Apr 1, 2026
595fc16
fix(ci): exclude evals with pre-existing workspace/batch bugs
christso Apr 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions .agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,25 @@
# grader_target so eval execution and grading use separate models.

targets:
# ── Grader (LLM-as-judge) ──────────────────────────────────────────
# "default" is an alias so example evals with `target: default` work.
# ── Default target (use_target) ────────────────────────────────────
# Evals without an explicit target resolve to "default". Its use_target
# field redirects to a named target, selected via the AGENT_TARGET env var.
# One env var switches the entire provider config (auth, model, etc.).
# Example: AGENT_TARGET=copilot-cli or AGENT_TARGET=claude
- name: default
provider: openai
base_url: https://models.github.ai/inference/v1
api_key: ${{ GH_MODELS_TOKEN }}
model: ${{ GH_MODELS_MODEL }}
use_target: ${{ AGENT_TARGET }}

# ── LLM target (text generation, no agent binary needed) ────────────
# Delegates to GRADER_TARGET — same provider used for grading and LLM evals.
- name: llm
use_target: ${{ GRADER_TARGET }}

# ── Grader (LLM-as-judge) ──────────────────────────────────────────
# Used by agent targets via grader_target. Switch provider via GRADER_TARGET.
- name: grader
provider: openai
base_url: https://models.github.ai/inference/v1
api_key: ${{ GH_MODELS_TOKEN }}
model: ${{ GH_MODELS_MODEL }}
use_target: ${{ GRADER_TARGET }}

# ── Agent targets ──────────────────────────────────────────────────
# ── Named agent targets ───────────────────────────────────────────
- name: copilot-cli
provider: copilot-cli
model: ${{ COPILOT_MODEL }}
Expand Down
45 changes: 36 additions & 9 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ on:
suite_filter:
description: "Comma-separated glob patterns for eval files to run"
required: false
default: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml"
default: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
target:
description: "Target name from .agentv/targets.yaml"
description: "Optional target override (leave empty to use each eval's own target)"
required: false
default: "copilot-cli"
default: ""
threshold:
description: "Minimum score threshold (0-1)"
required: false
Expand All @@ -34,21 +34,42 @@ jobs:
- name: Install GitHub Copilot CLI
run: curl -fsSL https://gh.io/copilot-install | bash

- name: Install Pi CLI
run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)"

- name: Install uv (Python package manager)
run: curl -LsSf https://astral.sh/uv/install.sh | sh

- name: Configure credentials
run: |
cat > .env <<EOF
GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }}
GRADER_TARGET=${{ vars.GRADER_TARGET || 'openrouter' }}
GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }}
GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }}
EOF

- name: Resolve inputs
id: filter
env:
DEFAULT_PATTERNS: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml"
DEFAULT_PATTERNS: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
# Exclude evals that need local scripts or multiple agent targets.
# Negation patterns (!glob) are supported by the CLI.
# multi-model-benchmark: needs multiple agents
# copilot-log-eval: needs copilot session files on disk
# batch-cli: batch output format mismatch (pre-existing)
# file-changes-graders: workspace cwd bug on retries (pre-existing)
EXCLUDE_PATTERNS: "!examples/showcase/multi-model-benchmark/**,!examples/features/copilot-log-eval/**,!examples/features/batch-cli/**,!examples/features/file-changes-graders/**"
run: |
echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT"
echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT"
PATTERNS="${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}"
EXCLUDES="${{ vars.EVAL_EXCLUDE_PATTERNS || env.EXCLUDE_PATTERNS }}"
echo "patterns=${PATTERNS},${EXCLUDES}" >> "$GITHUB_OUTPUT"
echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}" >> "$GITHUB_OUTPUT"
echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT"

- name: Run AgentV evals
Expand All @@ -61,10 +82,16 @@ jobs:

# Split comma-separated patterns into positional args
IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}"

# Build optional --target flag (empty = use each eval's own target)
TARGET_FLAG=()
if [ -n "${{ steps.filter.outputs.target }}" ]; then
TARGET_FLAG=(--target "${{ steps.filter.outputs.target }}")
fi

bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
--targets .agentv/targets.yaml \
--target ${{ steps.filter.outputs.target }} \
--workers 1 \
"${TARGET_FLAG[@]}" \
--workers 3 \
--threshold ${{ steps.filter.outputs.threshold }} \
-o .agentv/ci-results/junit.xml \
--benchmark-json .agentv/ci-results/benchmark.json \
Expand Down
76 changes: 51 additions & 25 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1210,31 +1210,57 @@ export async function runEvalCommand(
return [];
}

const result = await runSingleEvalFile({
testFilePath,
cwd,
repoRoot,
options,
outputWriter,
otelExporter,
cache,
evaluationRunner,
workersOverride: perFileWorkers,
yamlWorkers: targetPrep.yamlWorkers,
progressReporter,
seenEvalCases,
displayIdTracker,
selection,
inlineTargetLabel,
evalCases: applicableEvalCases,
trialsConfig: targetPrep.trialsConfig,
matrixMode: targetPrep.selections.length > 1,
totalBudgetUsd: targetPrep.totalBudgetUsd,
failOnError: targetPrep.failOnError,
threshold: resolvedThreshold,
});

return result.results;
try {
const result = await runSingleEvalFile({
testFilePath,
cwd,
repoRoot,
options,
outputWriter,
otelExporter,
cache,
evaluationRunner,
workersOverride: perFileWorkers,
yamlWorkers: targetPrep.yamlWorkers,
progressReporter,
seenEvalCases,
displayIdTracker,
selection,
inlineTargetLabel,
evalCases: applicableEvalCases,
trialsConfig: targetPrep.trialsConfig,
matrixMode: targetPrep.selections.length > 1,
totalBudgetUsd: targetPrep.totalBudgetUsd,
failOnError: targetPrep.failOnError,
threshold: resolvedThreshold,
});

return result.results;
} catch (fileError) {
// before_all or other setup failures should not abort the entire run.
// Mark all tests in this file as errors and continue with other files.
const message = fileError instanceof Error ? fileError.message : String(fileError);
console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
const errorResults: EvaluationResult[] = applicableEvalCases.map((evalCase) => ({
timestamp: new Date().toISOString(),
testId: evalCase.id,
score: 0,
assertions: [],
output: [],
scores: [],
error: message,
executionStatus: 'execution_error' as const,
failureStage: 'setup' as const,
failureReasonCode: 'setup_error' as const,
durationMs: 0,
tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
target: selection.targetName,
}));
for (const errResult of errorResults) {
await outputWriter.append(errResult);
}
return errorResults;
}
}),
);
for (const results of targetResults) {
Expand Down
20 changes: 19 additions & 1 deletion apps/cli/src/commands/eval/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,26 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
throw new Error('No eval paths provided.');
}

// Separate negation patterns (!glob) from include patterns.
// Negation patterns are passed to fast-glob as `ignore`.
const includePatterns: string[] = [];
const ignorePatterns: string[] = [];
for (const input of normalizedInputs) {
if (input.startsWith('!')) {
ignorePatterns.push(input.slice(1));
} else {
includePatterns.push(input);
}
}

if (includePatterns.length === 0) {
throw new Error('No eval paths provided (only negation patterns found).');
}

const unmatched: string[] = [];
const results = new Set<string>();

for (const pattern of normalizedInputs) {
for (const pattern of includePatterns) {
// If the pattern points to an existing file or directory, short-circuit globbing
const candidatePath = path.isAbsolute(pattern)
? path.normalize(pattern)
Expand All @@ -32,6 +48,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
unique: true,
dot: true,
followSymbolicLinks: true,
ignore: ignorePatterns,
});
if (dirMatches.length === 0) {
unmatched.push(pattern);
Expand All @@ -54,6 +71,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
unique: true,
dot: true,
followSymbolicLinks: true,
ignore: ignorePatterns,
});

const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath));
Expand Down
71 changes: 53 additions & 18 deletions apps/cli/src/commands/eval/targets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,57 @@ function isTTY(): boolean {
return process.stdout.isTTY ?? false;
}

/**
 * Resolve a target definition by name, following `use_target` delegation.
 *
 * A definition may carry a `use_target` field naming another definition to
 * use in its place. The value is either a literal target name or a single
 * `${{ ENV_VAR }}` reference that is looked up in `env`. This lets one env
 * var switch the entire provider config:
 *
 *   - name: default
 *     use_target: ${{ AGENT_TARGET }}   # e.g. "copilot-cli"
 *
 * Resolution stops, returning the definition reached so far, when
 * `use_target` is absent, blank, or refers to an env var that is unset or
 * empty.
 *
 * @param name - Name of the target to start from.
 * @param definitions - All target definitions loaded from the targets file.
 * @param env - Environment used to expand `${{ ENV_VAR }}` references.
 * @param targetsFilePath - Path of the targets file; used in error messages only.
 * @returns The definition at the end of the `use_target` chain.
 * @throws Error if `name` or any referenced target is not defined, if the
 *   chain loops back onto itself, or if it is longer than 5 hops.
 */
function resolveUseTarget(
  name: string,
  definitions: readonly TargetDefinition[],
  env: NodeJS.ProcessEnv,
  targetsFilePath: string,
): TargetDefinition {
  const maxDepth = 5;
  // Matches a value that is exactly one ${{ ENV_VAR }} reference.
  const envReference = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
  const findByName = (targetName: string): TargetDefinition | undefined =>
    definitions.find((d) => d.name === targetName);

  let current = findByName(name);
  if (!current) {
    const available = listTargetNames(definitions).join(', ');
    throw new Error(
      `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`,
    );
  }

  // Names visited so far, in order; used for cycle detection and diagnostics.
  const chain: string[] = [current.name];

  for (;;) {
    const useTarget = current.use_target;
    if (useTarget === undefined || useTarget === null) break;
    const raw = String(useTarget).trim();
    if (raw.length === 0) break;

    // Expand ${{ ENV_VAR }}; an unset or empty variable ends the chain here.
    const envVarName = raw.match(envReference)?.[1];
    const resolved = (envVarName !== undefined ? (env[envVarName] ?? '') : raw).trim();
    if (resolved.length === 0) break;

    // Fail loudly on a loop instead of returning a definition that still
    // delegates and would fail later with an unrelated-looking error.
    if (chain.includes(resolved)) {
      throw new Error(
        `Target '${name}' has a circular use_target chain in ${targetsFilePath}: ${[...chain, resolved].join(' -> ')}`,
      );
    }
    // chain holds the start plus one entry per hop already taken.
    if (chain.length > maxDepth) {
      throw new Error(
        `Target '${name}' use_target chain exceeds ${maxDepth} levels in ${targetsFilePath}: ${[...chain, resolved].join(' -> ')}`,
      );
    }

    const next = findByName(resolved);
    if (!next) {
      const available = listTargetNames(definitions).join(', ');
      // Name the definition that declares the bad reference, not the chain start.
      throw new Error(
        `Target '${current.name}' use_target '${resolved}' not found in ${targetsFilePath}. Available targets: ${available}`,
      );
    }
    chain.push(next.name);
    current = next;
  }

  return current;
}

export async function readTestSuiteTarget(testFilePath: string): Promise<string | undefined> {
const metadata = await readTestSuiteMetadata(testFilePath);
return metadata.target;
Expand Down Expand Up @@ -122,15 +173,7 @@ export async function selectTarget(options: TargetSelectionOptions): Promise<Tar
const fileTargetName = await readTestSuiteTarget(testFilePath);
const targetChoice = pickTargetName({ cliTargetName, fileTargetName });

const targetDefinition = definitions.find(
(definition: TargetDefinition) => definition.name === targetChoice.name,
);
if (!targetDefinition) {
const available = listTargetNames(definitions).join(', ');
throw new Error(
`Target '${targetChoice.name}' not found in ${targetsFilePath}. Available targets: ${available}`,
);
}
const targetDefinition = resolveUseTarget(targetChoice.name, definitions, env, targetsFilePath);

if (dryRun) {
const mockTarget: ResolvedTarget = {
Expand Down Expand Up @@ -226,15 +269,7 @@ export async function selectMultipleTargets(
const results: TargetSelection[] = [];

for (const name of targetNames) {
const targetDefinition = definitions.find(
(definition: TargetDefinition) => definition.name === name,
);
if (!targetDefinition) {
const available = listTargetNames(definitions).join(', ');
throw new Error(
`Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`,
);
}
const targetDefinition = resolveUseTarget(name, definitions, env, targetsFilePath);

if (dryRun) {
const mockTarget: ResolvedTarget = {
Expand Down
4 changes: 1 addition & 3 deletions evals/agentic-engineering/agent-plugin-review.eval.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin

execution:
targets:
- pi-cli
tags: [agent]

workspace:
template: ./workspace-template
Expand Down
3 changes: 0 additions & 3 deletions examples/features/agent-skills-evals/.agentv/targets.yaml

This file was deleted.

Loading
Loading