From 7173597092357c1b37c1a9a9fb8ec24b63a8462e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 27 Mar 2026 18:43:26 +1100 Subject: [PATCH] feat(pipeline): agent-mode reliability improvements - Add --llm-scores flag to pipeline bench (#790) - Add pipeline run combined command (#791) - Fix Windows subprocess in Python scripts (#789) - Update agentv-bench skill docs for cross-platform use (#792) --- apps/cli/src/commands/pipeline/bench.ts | 23 +- apps/cli/src/commands/pipeline/index.ts | 2 + apps/cli/src/commands/pipeline/run.ts | 421 ++++++++++++++++++ .../agentv-dev/skills/agentv-bench/SKILL.md | 72 +-- .../skills/agentv-bench/scripts/bench.py | 12 +- .../agentv-bench/scripts/run_code_graders.py | 12 +- .../skills/agentv-bench/scripts/run_tests.py | 42 +- 7 files changed, 542 insertions(+), 42 deletions(-) create mode 100644 apps/cli/src/commands/pipeline/run.ts diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 502858a3..5cd4cefc 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -2,8 +2,9 @@ * `agentv pipeline bench` — Merge code-grader and LLM grader scores into final * benchmark artifacts. 
* - * Reads code_grader_results from disk and LLM grader scores from stdin, - * computes weighted pass_rate per test, and writes: + * Reads code_grader_results from disk and LLM grader scores from a file + * (`--llm-scores `) or stdin, computes weighted pass_rate per test, + * and writes: * - /grading.json (per-test grading breakdown) * - index.jsonl (one line per test) * - benchmark.json (aggregate statistics) @@ -14,7 +15,7 @@ import { readFile, readdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import { command, positional, string } from 'cmd-ts'; +import { command, option, optional, positional, string } from 'cmd-ts'; interface EvaluatorScore { readonly name: string; @@ -33,14 +34,24 @@ export const evalBenchCommand = command({ displayName: 'export-dir', description: 'Export directory from pipeline input/grade', }), + llmScores: option({ + type: optional(string), + long: 'llm-scores', + description: 'Path to LLM scores JSON file (reads from stdin if omitted)', + }), }, - handler: async ({ exportDir }) => { + handler: async ({ exportDir, llmScores: llmScoresPath }) => { const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8')); const testIds: string[] = manifest.test_ids; const targetName: string = manifest.target?.name ?? 
'unknown'; - // Read LLM scores from stdin - const stdinData = await readStdin(); + // Read LLM scores from file or stdin + let stdinData: string; + if (llmScoresPath) { + stdinData = await readFile(llmScoresPath, 'utf8'); + } else { + stdinData = await readStdin(); + } const llmScores: Record< string, Record< diff --git a/apps/cli/src/commands/pipeline/index.ts b/apps/cli/src/commands/pipeline/index.ts index f038e698..365b0d4e 100644 --- a/apps/cli/src/commands/pipeline/index.ts +++ b/apps/cli/src/commands/pipeline/index.ts @@ -3,6 +3,7 @@ import { subcommands } from 'cmd-ts'; import { evalBenchCommand } from './bench.js'; import { evalGradeCommand } from './grade.js'; import { evalInputCommand } from './input.js'; +import { evalRunCommand } from './run.js'; export const pipelineCommand = subcommands({ name: 'pipeline', @@ -11,5 +12,6 @@ export const pipelineCommand = subcommands({ input: evalInputCommand, grade: evalGradeCommand, bench: evalBenchCommand, + run: evalRunCommand, }, }); diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts new file mode 100644 index 00000000..c52e7445 --- /dev/null +++ b/apps/cli/src/commands/pipeline/run.ts @@ -0,0 +1,421 @@ +/** + * `agentv pipeline run` — Combined command that runs input extraction, CLI target + * invocation, and code grading in a single step. + * + * Equivalent to running: + * 1. `agentv pipeline input <eval-path> --out <out-dir>` + * 2. Invoking each CLI target in parallel (writing response.md + timing.json) + * 3. `agentv pipeline grade <out-dir>` + * + * For `kind: agent` targets, step 2 is skipped (the agent handles execution). + * + * To add new features: extend the handler — all logic is self-contained. 
import { exec } from 'node:child_process';
import { existsSync, readFileSync } from 'node:fs';
import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join, resolve } from 'node:path';
import { promisify } from 'node:util';

import { executeScript, loadTestSuite } from '@agentv/core';
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
import { command, number, option, optional, positional, string } from 'cmd-ts';

import { findRepoRoot } from '../eval/shared.js';
import { selectTarget } from '../eval/targets.js';

/** Promise-returning `exec`, so target processes do not block the event loop. */
const execAsync = promisify(exec);

/** Timeout applied to a CLI target whose config does not specify one. */
const DEFAULT_TARGET_TIMEOUT_MS = 30000;

/** Upper bound on captured stdout/stderr of a target process (10 MiB). */
const TARGET_MAX_BUFFER_BYTES = 10 * 1024 * 1024;

/** CLI target settings resolved from the targets file. */
interface CliTargetInfo {
  readonly kind: 'cli';
  readonly command: string;
  readonly cwd: string;
  readonly timeoutMs: number;
}

/** Shape of the per-test `invoke.json` written in step 1. */
interface InvokeSpec {
  readonly kind: 'cli' | 'agent';
  readonly command?: string;
  readonly cwd?: string;
  readonly timeout_ms?: number;
}

/** Subset of the per-test `input.json` that later steps read back. */
interface InputSpec {
  readonly input_text: string;
  readonly input_messages: unknown;
}

/** Strip one pair of matching surrounding quotes from a .env value. */
function unquoteEnvValue(raw: string): string {
  const value = raw.trim();
  if (value.length >= 2) {
    const quote = value[0];
    if ((quote === '"' || quote === "'") && value.endsWith(quote)) {
      return value.slice(1, -1);
    }
  }
  return value;
}

/**
 * Load key=value pairs from the nearest `.env` file, searching `dir` and then
 * each parent directory. Ignores comments, blank lines, lines without `=`, and
 * lines with an empty key. An optional leading `export ` is accepted and one
 * pair of matching surrounding quotes is removed from each value.
 *
 * @param dir Directory to start searching from.
 * @returns The parsed variables, or an empty object when no `.env` is found.
 */
function loadEnvFile(dir: string): Record<string, string> {
  let current = resolve(dir);
  while (true) {
    const candidate = join(current, '.env');
    if (existsSync(candidate)) {
      const env: Record<string, string> = {};
      for (const line of readFileSync(candidate, 'utf8').split('\n')) {
        const trimmed = line.trim();
        if (!trimmed || trimmed.startsWith('#')) continue;
        const assignment = trimmed.startsWith('export ')
          ? trimmed.slice('export '.length).trimStart()
          : trimmed;
        const eqIdx = assignment.indexOf('=');
        if (eqIdx === -1) continue;
        const key = assignment.slice(0, eqIdx).trim();
        if (!key) continue;
        env[key] = unquoteEnvValue(assignment.slice(eqIdx + 1));
      }
      return env;
    }
    const parent = dirname(current);
    if (parent === current) break; // reached the filesystem root
    current = parent;
  }
  return {};
}

/**
 * Substitute `{PROMPT_FILE}`, `{OUTPUT_FILE}` and `{PROMPT}` in a command template.
 *
 * Every occurrence is replaced in a single pass, and a replacer function is
 * used so that `$&`, `$1`, `$$` inside the prompt text are inserted literally.
 *
 * NOTE(review): `{PROMPT}` is inserted into a shell command verbatim, so prompt
 * text containing quotes or shell metacharacters can break or alter the command.
 * Templates should prefer `{PROMPT_FILE}`; automatic quoting is not applied here
 * because existing templates may already quote the placeholder.
 */
function renderCommand(template: string, values: Readonly<Record<string, string>>): string {
  return template.replace(
    /\{(PROMPT_FILE|OUTPUT_FILE|PROMPT)\}/g,
    (placeholder: string, name: string) => values[name] ?? placeholder,
  );
}

/** Run `task` over `items` with at most `limit` tasks in flight at once. */
async function runWithConcurrency<T>(
  items: readonly T[],
  limit: number,
  task: (item: T) => Promise<void>,
): Promise<void> {
  let cursor = 0;
  const lanes = Array.from({ length: Math.min(limit, items.length) }, async () => {
    while (cursor < items.length) {
      const item = items[cursor++] as T;
      await task(item);
    }
  });
  await Promise.all(lanes);
}

/** Read a UTF-8 file, returning `undefined` when it does not exist. */
async function readTextIfExists(filePath: string): Promise<string | undefined> {
  try {
    return await readFile(filePath, 'utf8');
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code === 'ENOENT') return undefined;
    throw error;
  }
}

/** Write `timing.json` for one test. */
async function writeTiming(testDir: string, durationMs: number): Promise<void> {
  await writeJson(join(testDir, 'timing.json'), {
    duration_ms: durationMs,
    total_duration_seconds: Math.round(durationMs / 10) / 100,
  });
}

/**
 * Invoke the CLI target for a single test and write `response.md` and
 * `timing.json` into its directory. A failing or timed-out target is recorded
 * as an `ERROR:` response instead of aborting the run.
 */
async function invokeCliTarget(
  testDir: string,
  testId: string,
  env: NodeJS.ProcessEnv,
): Promise<void> {
  const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8')) as InvokeSpec;
  if (invoke.kind !== 'cli' || invoke.command === undefined) return;

  const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8')) as InputSpec;
  const timeoutMs = invoke.timeout_ms ?? DEFAULT_TARGET_TIMEOUT_MS;

  // A private scratch directory per invocation: unique even for concurrent
  // runs, and independent of characters that may appear in the test id.
  const scratchDir = await mkdtemp(join(tmpdir(), 'agentv-run-'));
  const promptFile = join(scratchDir, 'prompt.txt');
  const outputFile = join(scratchDir, 'output.txt');

  let start = performance.now();
  try {
    await writeFile(promptFile, inputData.input_text, 'utf8');
    const rendered = renderCommand(invoke.command, {
      PROMPT_FILE: promptFile,
      OUTPUT_FILE: outputFile,
      PROMPT: inputData.input_text,
    });

    start = performance.now();
    await execAsync(rendered, {
      cwd: invoke.cwd,
      timeout: timeoutMs,
      env,
      maxBuffer: TARGET_MAX_BUFFER_BYTES,
    });
    const durationMs = Math.round(performance.now() - start);

    const response = (await readTextIfExists(outputFile)) ?? 'ERROR: No output file generated';
    await writeFile(join(testDir, 'response.md'), response, 'utf8');
    await writeTiming(testDir, durationMs);

    console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
  } catch (error) {
    const durationMs = Math.round(performance.now() - start);
    const message = error instanceof Error ? error.message : String(error);
    await writeFile(join(testDir, 'response.md'), `ERROR: target failed — ${message}`, 'utf8');
    await writeTiming(testDir, durationMs);
    console.error(`  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`);
  } finally {
    // Best-effort cleanup; a leftover scratch directory must not fail the run.
    await rm(scratchDir, { recursive: true, force: true }).catch(() => undefined);
  }
}

/** Write one `code_grader_results/<name>.json` file. */
async function writeGraderResult(
  resultsDir: string,
  graderName: string,
  result: {
    readonly score: number;
    readonly weight: number;
    readonly assertions: unknown;
    readonly details: unknown;
  },
): Promise<void> {
  await writeJson(join(resultsDir, `${graderName}.json`), {
    name: graderName,
    type: 'code-grader',
    score: result.score,
    weight: result.weight,
    assertions: result.assertions,
    details: result.details,
  });
}

/**
 * Run every code grader configured under `<outDir>/<testId>/code_graders`
 * against that test's `response.md`.
 *
 * Tests without grader configs are skipped silently. Tests without a
 * `response.md` (e.g. agent-as-target mode before the agent has answered) are
 * skipped with a warning instead of aborting the whole command.
 *
 * @returns Number of graders executed and how many scored >= 0.5.
 */
async function runCodeGraders(
  outDir: string,
  testIds: readonly string[],
): Promise<{ totalGraders: number; totalPassed: number }> {
  let totalGraders = 0;
  let totalPassed = 0;

  for (const testId of testIds) {
    const testDir = join(outDir, testId);
    const codeGradersDir = join(testDir, 'code_graders');
    const resultsDir = join(testDir, 'code_grader_results');

    let graderFiles: string[];
    try {
      graderFiles = (await readdir(codeGradersDir)).filter((f) => f.endsWith('.json'));
    } catch {
      continue; // no code_graders directory for this test
    }
    if (graderFiles.length === 0) continue;

    const responseText = await readTextIfExists(join(testDir, 'response.md'));
    if (responseText === undefined) {
      console.warn(`  ${testId}: response.md not found — skipping code graders`);
      continue;
    }
    await mkdir(resultsDir, { recursive: true });

    const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8')) as InputSpec;

    for (const graderFile of graderFiles) {
      const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8'));
      const graderName: string = graderConfig.name;
      const weight: number = graderConfig.weight ?? 1.0;

      const payload = JSON.stringify({
        output: [{ role: 'assistant', content: responseText }],
        input: inputData.input_messages,
        question: inputData.input_text,
        criteria: '',
        expected_output: [],
        reference_answer: '',
        input_files: [],
        trace: null,
        token_usage: null,
        cost_usd: null,
        duration_ms: null,
        start_time: null,
        end_time: null,
        file_changes: null,
        workspace_path: null,
        config: graderConfig.config ?? null,
        metadata: {},
        input_text: inputData.input_text,
        output_text: responseText,
        expected_output_text: '',
      });

      try {
        const stdout = await executeScript(
          graderConfig.command,
          payload,
          undefined,
          graderConfig.cwd,
        );
        const parsed = JSON.parse(stdout);
        const score = typeof parsed.score === 'number' ? parsed.score : 0;
        const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];

        await writeGraderResult(resultsDir, graderName, {
          score,
          weight,
          assertions,
          details: parsed.details ?? {},
        });
        totalGraders++;
        if (score >= 0.5) totalPassed++;
      } catch (error) {
        // A crashing grader or unparsable grader output counts as a score of 0.
        const message = error instanceof Error ? error.message : String(error);
        console.error(`  ${testId}/${graderName}: ERROR — ${message}`);
        await writeGraderResult(resultsDir, graderName, {
          score: 0,
          weight,
          assertions: [{ text: `Error: ${message}`, passed: false }],
          details: { error: message },
        });
        totalGraders++;
      }
    }
  }

  return { totalGraders, totalPassed };
}

/**
 * `agentv pipeline run <eval-path> --out <dir> [--workers <n>]`
 *
 * 1. Extracts per-test inputs and grader configs into `<dir>`.
 * 2. For CLI targets, invokes the target once per test with at most `--workers`
 *    invocations in flight (default: all tests at once).
 * 3. Runs code graders on every test that has a `response.md`.
 */
export const evalRunCommand = command({
  name: 'run',
  description: 'Extract inputs, invoke CLI targets, and run code graders in one step',
  args: {
    evalPath: positional({
      type: string,
      displayName: 'eval-path',
      description: 'Path to eval YAML file',
    }),
    out: option({
      type: string,
      long: 'out',
      description: 'Output directory for results',
    }),
    workers: option({
      type: optional(number),
      long: 'workers',
      description: 'Parallel workers for target invocation (default: all tests)',
    }),
  },
  handler: async ({ evalPath, out, workers }) => {
    if (workers !== undefined && (!Number.isInteger(workers) || workers < 1)) {
      console.error('--workers must be a positive integer.');
      process.exit(1);
    }

    const resolvedEvalPath = resolve(evalPath);
    const outDir = resolve(out);
    const evalDir = dirname(resolvedEvalPath);
    const repoRoot = await findRepoRoot(evalDir);

    // ── Step 1: Extract inputs (same as pipeline input) ──────────────
    const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
    const tests = suite.tests;

    if (tests.length === 0) {
      console.error('No tests found in eval file.');
      process.exit(1);
    }

    let targetInfo: CliTargetInfo | null = null;
    let targetName = 'agent';
    let targetKind = 'agent';

    try {
      const selection = await selectTarget({
        testFilePath: resolvedEvalPath,
        repoRoot,
        cwd: evalDir,
        dryRun: false,
        dryRunDelay: 0,
        dryRunDelayMin: 0,
        dryRunDelayMax: 0,
        env: process.env,
      });
      targetName = selection.targetName;
      if (selection.resolvedTarget.kind === 'cli') {
        targetKind = 'cli';
        const config = selection.resolvedTarget.config;
        targetInfo = {
          kind: 'cli',
          command: config.command,
          cwd: config.cwd ?? evalDir,
          timeoutMs: config.timeoutMs ?? DEFAULT_TARGET_TIMEOUT_MS,
        };
      }
    } catch {
      // No targets file — agent-as-target mode
    }

    const testIds: string[] = [];

    for (const test of tests) {
      const testDir = join(outDir, test.id);
      await mkdir(testDir, { recursive: true });
      testIds.push(test.id);

      const inputMessages = test.input.map((m) => ({
        role: m.role,
        content: m.content,
      }));
      await writeJson(join(testDir, 'input.json'), {
        input_text: test.question,
        input_messages: inputMessages,
        file_paths: test.file_paths,
        metadata: test.metadata ?? {},
      });

      if (targetInfo) {
        await writeJson(join(testDir, 'invoke.json'), {
          kind: 'cli',
          command: targetInfo.command,
          cwd: targetInfo.cwd,
          timeout_ms: targetInfo.timeoutMs,
          env: {},
        });
      } else {
        await writeJson(join(testDir, 'invoke.json'), {
          kind: 'agent',
          instructions: 'Execute this task in the current workspace. The agent IS the target.',
        });
      }

      await writeFile(join(testDir, 'criteria.md'), test.criteria ?? '', 'utf8');

      if (
        test.expected_output.length > 0 ||
        (test.reference_answer !== undefined && test.reference_answer !== '')
      ) {
        await writeJson(join(testDir, 'expected_output.json'), {
          expected_output: test.expected_output,
          reference_answer: test.reference_answer ?? '',
        });
      }

      await writeGraderConfigs(testDir, test.assertions ?? [], evalDir);
    }

    await writeJson(join(outDir, 'manifest.json'), {
      eval_file: resolvedEvalPath,
      timestamp: new Date().toISOString(),
      target: { name: targetName, kind: targetKind },
      test_ids: testIds,
    });

    console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);

    // ── Step 2: Invoke CLI targets in parallel ───────────────────────
    if (targetInfo) {
      // Values from .env take precedence over the inherited environment.
      const mergedEnv: NodeJS.ProcessEnv = { ...process.env, ...loadEnvFile(evalDir) };
      const maxWorkers = Math.min(workers ?? testIds.length, testIds.length);

      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);

      await runWithConcurrency(testIds, maxWorkers, (testId) =>
        invokeCliTarget(join(outDir, testId), testId, mergedEnv),
      );
    } else {
      console.log('Agent-as-target mode — skipping CLI invocation.');
    }

    // ── Step 3: Run code graders (same as pipeline grade) ────────────
    const { totalGraders, totalPassed } = await runCodeGraders(outDir, testIds);

    console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
    console.log(`\nDone. Agent can now perform LLM grading on responses in ${outDir}`);
  },
});

// ── Helpers (shared with input.ts) ──────────────────────────────────

/** Write `data` as pretty-printed JSON (2-space indent, trailing newline). */
async function writeJson(filePath: string, data: unknown): Promise<void> {
  await writeFile(filePath, `${JSON.stringify(data, null, 2)}\n`, 'utf8');
}

/**
 * Write one JSON config per `code-grader` / `llm-grader` assertion into
 * `<testDir>/code_graders` and `<testDir>/llm_graders`. Each directory is
 * created only when at least one grader of that type exists. Assertions of any
 * other type are ignored.
 *
 * @param testDir    Per-test export directory.
 * @param assertions Assertions declared for the test.
 * @param evalDir    Fallback working directory for code graders.
 */
async function writeGraderConfigs(
  testDir: string,
  assertions: readonly EvaluatorConfig[],
  evalDir: string,
): Promise<void> {
  const codeGradersDir = join(testDir, 'code_graders');
  const llmGradersDir = join(testDir, 'llm_graders');

  let hasCodeGraders = false;
  let hasLlmGraders = false;

  for (const assertion of assertions) {
    if (assertion.type === 'code-grader') {
      if (!hasCodeGraders) {
        await mkdir(codeGradersDir, { recursive: true });
        hasCodeGraders = true;
      }
      const config = assertion as CodeEvaluatorConfig;
      await writeJson(join(codeGradersDir, `${config.name}.json`), {
        name: config.name,
        command: config.command,
        cwd: config.resolvedCwd ?? config.cwd ?? evalDir,
        weight: config.weight ?? 1.0,
        config: config.config ?? {},
      });
    } else if (assertion.type === 'llm-grader') {
      if (!hasLlmGraders) {
        await mkdir(llmGradersDir, { recursive: true });
        hasLlmGraders = true;
      }
      const config = assertion as LlmGraderEvaluatorConfig;
      // Prefer the prompt file; fall back to an inline string prompt when the
      // file is absent or unreadable.
      let promptContent = typeof config.prompt === 'string' ? config.prompt : '';
      if (config.resolvedPromptPath) {
        try {
          promptContent = await readFile(config.resolvedPromptPath, 'utf8');
        } catch {
          /* keep the inline fallback */
        }
      }
      await writeJson(join(llmGradersDir, `${config.name}.json`), {
        name: config.name,
        prompt_content: promptContent,
        weight: config.weight ?? 1.0,
        threshold: 0.5,
        config: {},
      });
    }
  }
}
Merge all scores and produce final artifacts +agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json ``` ### Skill management scripts @@ -276,30 +273,51 @@ When `AGENT_EVAL_MODE=agent` (default), use the pipeline CLI subcommands (`pipel - `agentv` CLI is installed (or run from source with `bun apps/cli/src/cli.ts`) - Read `references/eval-yaml-spec.md` for the full schema -**Step 1: Extract inputs and run targets** +**Recommended: Single command for CLI targets** + +For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step: ```bash -# Using Python wrapper (recommended — handles target invocation in parallel): -python scripts/run_tests.py evals/repro.eval.yaml --out .agentv/results/export/run-1 +# Extract inputs, invoke all CLI targets in parallel, run code graders: +agentv pipeline run evals/repro.eval.yaml --out .agentv/results/export/run-1 +``` -# Or using CLI directly (extract only, agent handles execution): -agentv pipeline input evals/repro.eval.yaml --out .agentv/results/export/run-1 +Then the agent performs LLM grading and merges scores: + +```bash +# bash: +agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json + +# PowerShell (if --llm-scores is unavailable, pipe via Get-Content): +# Get-Content llm_scores.json | agentv pipeline bench .agentv/results/export/run-1 ``` -This creates an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`). For CLI targets, `run_tests.py` also invokes the target and writes `response.md`. +That's the entire pipeline: **2 commands** plus the agent's LLM grading step. -For agent-as-target mode, the agent executes each test using the extracted `input.json` and writes `response.md` directly. 
+**Alternative: Step-by-step (agent-as-target or fine-grained control)** -**Step 2: Run code graders** +Use individual commands when the agent IS the target or you need control over each step: ```bash -python scripts/run_code_graders.py .agentv/results/export/run-1 -# Or: agentv pipeline grade .agentv/results/export/run-1 +# Step 1: Extract inputs +agentv pipeline input evals/repro.eval.yaml --out .agentv/results/export/run-1 + +# Step 2: Agent invokes each test (reads input.json, writes response.md) +# For CLI targets, you can also use the Python wrapper: +# python scripts/run_tests.py evals/repro.eval.yaml --out .agentv/results/export/run-1 + +# Step 3: Run code graders +agentv pipeline grade .agentv/results/export/run-1 + +# Step 4: Agent does LLM grading, writes llm_scores.json + +# Step 5: Merge scores +agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json ``` -Executes all code-grader assertions against `response.md` files. Writes `code_grader_results/.json` per test. +This creates an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`). -**Step 3: LLM grading (agent performs directly)** +**Step 3 (LLM grading): agent performs directly** The agent reads `llm_graders/.json` for each test, grades the response using the prompt content, and produces a scores JSON: @@ -314,19 +332,11 @@ The agent reads `llm_graders/.json` for each test, grades the response usi } ``` -Dispatch the `grader` subagent (read `agents/grader.md`) for this step. +**Subagent environments (Claude Code):** Dispatch the `grader` subagent (read `agents/grader.md`) for this step. -**Step 4: Merge scores and produce artifacts** - -```bash -python scripts/bench.py .agentv/results/export/run-1 < llm_scores.json -# Or: agentv pipeline bench .agentv/results/export/run-1 < llm_scores.json -``` +**Non-subagent environments (VS Code Copilot, Codex, etc.):** Perform LLM grading inline. 
Read each `llm_graders/.json`, grade the response against the `prompt_content` criteria, score 0.0–1.0 with evidence, and write the result to `llm_scores.json` in the export directory. -Merges code-grader + LLM scores, computes weighted pass_rate, and writes: -- `/grading.json` — per-test grading breakdown -- `index.jsonl` — one line per test -- `benchmark.json` — aggregate statistics +**Note on Python wrapper scripts:** The `scripts/` directory contains Python wrappers (`run_tests.py`, `run_code_graders.py`, `bench.py`) that call the CLI commands. These are provided as an alternative but the direct CLI commands above are preferred — they work cross-platform without Python dependency issues. **Output structure:** ``` diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py b/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py index 07624279..0197ced1 100644 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py +++ b/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py @@ -29,10 +29,20 @@ //grading.json <- merged grading per test """ import argparse +import shutil import subprocess import sys +def _find_agentv() -> str: + """Resolve the agentv executable via PATH (handles .ps1/.cmd on Windows).""" + path = shutil.which("agentv") + if not path: + print("agentv CLI not found. 
Install: bun install -g agentv", file=sys.stderr) + sys.exit(1) + return path + + def main(): parser = argparse.ArgumentParser( description="Merge scores and produce benchmark artifacts" @@ -42,7 +52,7 @@ def main(): # Pass stdin through to agentv pipeline bench result = subprocess.run( - ["agentv", "pipeline", "bench", args.export_dir], + [_find_agentv(), "pipeline", "bench", args.export_dir], stdin=sys.stdin, ) sys.exit(result.returncode) diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py index 2b846c42..b69dbf56 100644 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py +++ b/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py @@ -19,17 +19,27 @@ //code_grader_results/.json """ import argparse +import shutil import subprocess import sys +def _find_agentv() -> str: + """Resolve the agentv executable via PATH (handles .ps1/.cmd on Windows).""" + path = shutil.which("agentv") + if not path: + print("agentv CLI not found. 
Install: bun install -g agentv", file=sys.stderr) + sys.exit(1) + return path + + def main(): parser = argparse.ArgumentParser(description="Run code-grader assertions") parser.add_argument("export_dir", help="Export directory from pipeline input") args = parser.parse_args() result = subprocess.run( - ["agentv", "pipeline", "grade", args.export_dir], + [_find_agentv(), "pipeline", "grade", args.export_dir], capture_output=False, ) sys.exit(result.returncode) diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py index 3dbda367..c8ce32f7 100644 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py +++ b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py @@ -26,6 +26,7 @@ import argparse import json import os +import shutil import subprocess import sys import tempfile @@ -35,10 +36,33 @@ from pathlib import Path +def _find_agentv() -> str: + """Resolve the agentv executable via PATH (handles .ps1/.cmd on Windows).""" + path = shutil.which("agentv") + if not path: + print("agentv CLI not found. 
def _load_env(env_file: Path) -> dict:
    """Read key=value pairs from a .env file, ignoring comments and blanks.

    The file is decoded as UTF-8 (a leading BOM is tolerated) rather than with
    the platform's locale encoding, so non-ASCII values load the same way on
    Windows as elsewhere. An optional leading ``export `` is accepted, one pair
    of matching surrounding quotes is stripped from each value, and lines with
    no ``=`` or with an empty key are skipped.

    Args:
        env_file: Path to the .env file to parse.

    Returns:
        Mapping of variable name to string value.
    """
    env = {}
    for raw_line in env_file.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        key, sep, value = line.partition("=")
        key = key.strip()
        if not sep or not key:
            continue
        value = value.strip()
        # KEY="a b" / KEY='a b' -> a b (only when both ends carry the same quote)
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        env[key] = value
    return env
+ list(eval_dir.parents): + candidate = p / ".env" + if candidate.exists(): + env_file = candidate + break + extra_env = _load_env(env_file) if env_file else {} + manifest = run_agentv_input(args.eval_path, args.out) out = Path(args.out) @@ -149,7 +185,7 @@ def main(): print(f"Running {len(cli_tests)} CLI target(s) with {args.workers} workers...") with ThreadPoolExecutor(max_workers=args.workers) as pool: - futures = {pool.submit(invoke_cli_target, td): td.name for td in cli_tests} + futures = {pool.submit(invoke_cli_target, td, extra_env): td.name for td in cli_tests} for future in as_completed(futures): tid = futures[future] try: