diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 1c829dc7..58a86c27 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -2,21 +2,18 @@ * `agentv pipeline bench` — Merge code-grader and LLM grader scores into final * benchmark artifacts. * - * Reads code_grader_results from disk and LLM grader scores from a file - * (`--llm-scores `) or stdin, computes weighted pass_rate per test, - * and writes: + * Reads code_grader_results and llm_grader_results from disk per test. + * + * Writes: * - /grading.json (per-test grading breakdown) * - index.jsonl (one line per test) * - benchmark.json (aggregate statistics) - * - * Stdin format (LLM scores): - * { "": { "": { "score": 0.85, "assertions": [...] } } } */ import { existsSync } from 'node:fs'; import { readFile, readdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import { command, option, optional, positional, string } from 'cmd-ts'; +import { command, positional, string } from 'cmd-ts'; interface EvaluatorScore { readonly name: string; @@ -35,13 +32,8 @@ export const evalBenchCommand = command({ displayName: 'export-dir', description: 'Export directory from pipeline input/grade', }), - llmScores: option({ - type: optional(string), - long: 'llm-scores', - description: 'Path to LLM scores JSON file (reads from stdin if omitted)', - }), }, - handler: async ({ exportDir, llmScores: llmScoresPath }) => { + handler: async ({ exportDir }) => { const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8')); const testIds: string[] = manifest.test_ids; const targetName: string = manifest.target?.name ?? 'unknown'; @@ -49,21 +41,6 @@ export const evalBenchCommand = command({ const experiment: string | undefined = manifest.experiment; const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : ''; - // Read LLM scores from file or stdin - let stdinData: string; - if (llmScoresPath) { - stdinData = await readFile(llmScoresPath, 'utf8'); - } else { - stdinData = await readStdin(); - } - const llmScores: Record< - string, - Record< - string, - { score: number; assertions: { text: string; passed: boolean; evidence?: string }[] } - > - > = stdinData ? JSON.parse(stdinData) : {}; - const indexLines: string[] = []; const allPassRates: number[] = []; @@ -95,16 +72,23 @@ export const evalBenchCommand = command({ // No code grader results } - // Collect LLM grader scores (from stdin data) - const testLlmScores = llmScores[testId] ?? {}; - // Read LLM grader metadata for weights + // Collect LLM grader scores from per-test disk results const llmGradersDir = join(testDir, 'llm_graders'); try { const graderFiles = (await readdir(llmGradersDir)).filter((f) => f.endsWith('.json')); for (const file of graderFiles) { const graderMeta = JSON.parse(await readFile(join(llmGradersDir, file), 'utf8')); const graderName = graderMeta.name; - const llmResult = testLlmScores[graderName]; + + const diskResultPath = join(testDir, 'llm_grader_results', `${graderName}.json`); + let llmResult: + | { score: number; assertions?: { text: string; passed: boolean; evidence?: string }[] } + | undefined; + try { + llmResult = JSON.parse(await readFile(diskResultPath, 'utf8')); + } catch { + // No result for this grader + } if (llmResult) { evaluators.push({ @@ -133,7 +117,11 @@ export const evalBenchCommand = command({ const passed = allAssertions.filter((a) => a.passed).length; const failed = allAssertions.filter((a) => !a.passed).length; const passRate = - allAssertions.length > 0 ? Math.round((passed / allAssertions.length) * 1000) / 1000 : 0; + allAssertions.length > 0 + ? Math.round((passed / allAssertions.length) * 1000) / 1000 + : weightedScore >= 0.5 + ? 1.0 + : 0.0; allPassRates.push(passRate); @@ -238,14 +226,6 @@ export const evalBenchCommand = command({ }, }); -async function readStdin(): Promise { - const chunks: Buffer[] = []; - for await (const chunk of process.stdin) { - chunks.push(chunk); - } - return Buffer.concat(chunks).toString('utf8').trim(); -} - function computeStats(values: readonly number[]): { mean: number; stddev: number } { if (values.length === 0) return { mean: 0, stddev: 0 }; const mean = values.reduce((sum, v) => sum + v, 0) / values.length; diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 89146f21..80729f0b 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -104,7 +104,15 @@ export async function runCodeGraders( ); const parsed = JSON.parse(stdout); const score = typeof parsed.score === 'number' ? parsed.score : 0; - const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : []; + // TODO: Remove hits/misses fallback once all grader scripts emit assertions natively. + // The hits/misses format is deprecated; graders should output { assertions: [...] } directly. + const assertions: { text: string; passed: boolean }[] = + Array.isArray(parsed.assertions) && parsed.assertions.length > 0 + ? parsed.assertions + : [ + ...(parsed.hits ?? []).map((h: string) => ({ text: h, passed: true })), + ...(parsed.misses ?? []).map((m: string) => ({ text: m, passed: false })), + ]; const result = { name: graderName, diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index f6225bbc..8d6b6be8 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -10,9 +10,11 @@ describe('pipeline bench', () => { const testDir = join(OUT_DIR, 'test-01'); const codeResultsDir = join(testDir, 'code_grader_results'); const llmGradersDir = join(testDir, 'llm_graders'); + const llmResultsDir = join(testDir, 'llm_grader_results'); const codeGradersDir = join(testDir, 'code_graders'); await mkdir(codeResultsDir, { recursive: true }); await mkdir(llmGradersDir, { recursive: true }); + await mkdir(llmResultsDir, { recursive: true }); await mkdir(codeGradersDir, { recursive: true }); await writeFile( @@ -58,17 +60,17 @@ describe('pipeline bench', () => { }); it('writes grading.json with merged scores and pass_rate', async () => { - const llmScores = JSON.stringify({ - 'test-01': { - relevance: { - score: 0.8, - assertions: [{ text: 'Relevant response', passed: true, evidence: 'matches criteria' }], - }, - }, - }); + // Write LLM grader result to disk (the default flow) + await writeFile( + join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'), + JSON.stringify({ + score: 0.8, + assertions: [{ text: 'Relevant response', passed: true, evidence: 'matches criteria' }], + }), + ); const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores }); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); const grading = JSON.parse(await readFile(join(OUT_DIR, 'test-01', 'grading.json'), 'utf8')); expect(grading.summary.pass_rate).toBeGreaterThan(0); @@ -77,17 +79,16 @@ describe('pipeline bench', () => { }); it('writes index.jsonl with one entry per test', async () => { - const llmScores = JSON.stringify({ - 'test-01': { - relevance: { - score: 0.8, - assertions: [{ text: 'Relevant', passed: true }], - }, - }, - }); + await writeFile( + join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'), + JSON.stringify({ + score: 0.8, + assertions: [{ text: 'Relevant', passed: true }], + }), + ); const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores }); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); const lines = indexContent @@ -100,14 +101,16 @@ describe('pipeline bench', () => { }); it('writes benchmark.json with run_summary', async () => { - const llmScores = JSON.stringify({ - 'test-01': { - relevance: { score: 0.8, assertions: [{ text: 'ok', passed: true }] }, - }, - }); + await writeFile( + join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'), + JSON.stringify({ + score: 0.8, + assertions: [{ text: 'ok', passed: true }], + }), + ); const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores }); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); expect(benchmark.metadata.targets).toContain('test-target'); @@ -128,7 +131,7 @@ describe('pipeline bench', () => { ); const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' }); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); const entry = JSON.parse(indexContent.trim().split('\n')[0]); @@ -140,7 +143,7 @@ describe('pipeline bench', () => { it('omits experiment from output when manifest has no experiment', async () => { const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' }); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); const entry = JSON.parse(indexContent.trim().split('\n')[0]); diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts index b2542b0e..c0e7422b 100644 --- a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts +++ b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts @@ -1,4 +1,4 @@ -import { readFile, rm, writeFile } from 'node:fs/promises'; +import { mkdir, readFile, rm, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { afterEach, describe, expect, it } from 'vitest'; @@ -33,16 +33,17 @@ describe('eval pipeline e2e', () => { ); expect(gradeResult.score).toBe(1); - // Step 4: pipeline bench with mock LLM scores - const llmScores = JSON.stringify({ - 'test-01': { - relevance: { - score: 0.9, - assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }], - }, - }, - }); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores }); + // Step 4: Write mock LLM grader result to disk, then run pipeline bench + const llmResultsDir = join(OUT_DIR, 'input-test', 'test-01', 'llm_grader_results'); + await mkdir(llmResultsDir, { recursive: true }); + await writeFile( + join(llmResultsDir, 'relevance.json'), + JSON.stringify({ + score: 0.9, + assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }], + }), + ); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); // Verify final artifacts const grading = JSON.parse( diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 9aa7e796..c09fb76c 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -200,18 +200,26 @@ Each grader subagent (read `agents/grader.md`): 1. Reads `/llm_graders/.json` for the grading prompt 2. Reads `/response.md` for the candidate output 3. Grades the response against the prompt criteria -4. Returns score (0.0–1.0) and per-assertion evidence +4. **Writes its result to disk**: `///llm_grader_results/.json` +5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator -After **all** grader subagents complete, merge their results into a single `llm_scores.json` in the run directory. +**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/.json` makes grading resumable and assertion evidence durable. + +The result file format is: +```json +{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] } +``` + +After **all** grader subagents complete, run Phase 3 directly. **Phase 3: Merge and validate** ```bash -agentv pipeline bench --llm-scores llm_scores.json +agentv pipeline bench agentv results validate ``` -This merges code-grader + LLM scores, computes weighted pass_rate, writes `grading.json` + `index.jsonl` + `benchmark.json`. +`pipeline bench` reads LLM grader results from `llm_grader_results/.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`. ### Artifacts diff --git a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md index d95a8595..e7c908ef 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md @@ -313,18 +313,13 @@ Runs code-grader assertions against `response.md` files in each test directory. ### `agentv pipeline bench ` -Merges code-grader results with LLM grader scores (read from stdin) and produces final artifacts. +Merges code-grader results with LLM grader scores and produces final artifacts. -**Stdin format (LLM grader scores):** +LLM grader results are read from disk at `/llm_grader_results/.json` per test. + +**LLM grader result file format** (`llm_grader_results/.json`): ```json -{ - "": { - "": { - "score": 0.85, - "assertions": [{"text": "...", "passed": true, "evidence": "..."}] - } - } -} +{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] } ``` **Output:** diff --git a/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md b/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md index dfb918bc..50f92988 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md @@ -109,10 +109,10 @@ agentv pipeline input evals/repro.eval.yaml # Step 3: Run code graders agentv pipeline grade -# Step 4: Subagent does LLM grading, writes llm_scores.json +# Step 4: Subagent does LLM grading, writes results to llm_grader_results/.json per test # Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) -agentv pipeline bench --llm-scores llm_scores.json +agentv pipeline bench # Step 6: Validate agentv results validate