Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 21 additions & 41 deletions apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,18 @@
* `agentv pipeline bench` — Merge code-grader and LLM grader scores into final
* benchmark artifacts.
*
* Reads code_grader_results from disk and LLM grader scores from a file
* (`--llm-scores <path>`) or stdin, computes weighted pass_rate per test,
* and writes:
* Reads code_grader_results and llm_grader_results from disk per test.
*
* Writes:
* - <test-id>/grading.json (per-test grading breakdown)
* - index.jsonl (one line per test)
* - benchmark.json (aggregate statistics)
*
* Stdin format (LLM scores):
* { "<test-id>": { "<grader-name>": { "score": 0.85, "assertions": [...] } } }
*/
import { existsSync } from 'node:fs';
import { readFile, readdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';

import { command, option, optional, positional, string } from 'cmd-ts';
import { command, positional, string } from 'cmd-ts';

interface EvaluatorScore {
readonly name: string;
Expand All @@ -35,35 +32,15 @@ export const evalBenchCommand = command({
displayName: 'export-dir',
description: 'Export directory from pipeline input/grade',
}),
llmScores: option({
type: optional(string),
long: 'llm-scores',
description: 'Path to LLM scores JSON file (reads from stdin if omitted)',
}),
},
handler: async ({ exportDir, llmScores: llmScoresPath }) => {
handler: async ({ exportDir }) => {
const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
const testIds: string[] = manifest.test_ids;
const targetName: string = manifest.target?.name ?? 'unknown';
const evalSet: string = manifest.dataset ?? '';
const experiment: string | undefined = manifest.experiment;
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';

// Read LLM scores from file or stdin
let stdinData: string;
if (llmScoresPath) {
stdinData = await readFile(llmScoresPath, 'utf8');
} else {
stdinData = await readStdin();
}
const llmScores: Record<
string,
Record<
string,
{ score: number; assertions: { text: string; passed: boolean; evidence?: string }[] }
>
> = stdinData ? JSON.parse(stdinData) : {};

const indexLines: string[] = [];
const allPassRates: number[] = [];

Expand Down Expand Up @@ -95,16 +72,23 @@ export const evalBenchCommand = command({
// No code grader results
}

// Collect LLM grader scores (from stdin data)
const testLlmScores = llmScores[testId] ?? {};
// Read LLM grader metadata for weights
// Collect LLM grader scores from per-test disk results
const llmGradersDir = join(testDir, 'llm_graders');
try {
const graderFiles = (await readdir(llmGradersDir)).filter((f) => f.endsWith('.json'));
for (const file of graderFiles) {
const graderMeta = JSON.parse(await readFile(join(llmGradersDir, file), 'utf8'));
const graderName = graderMeta.name;
const llmResult = testLlmScores[graderName];

const diskResultPath = join(testDir, 'llm_grader_results', `${graderName}.json`);
let llmResult:
| { score: number; assertions?: { text: string; passed: boolean; evidence?: string }[] }
| undefined;
try {
llmResult = JSON.parse(await readFile(diskResultPath, 'utf8'));
} catch {
// No result for this grader
}

if (llmResult) {
evaluators.push({
Expand Down Expand Up @@ -133,7 +117,11 @@ export const evalBenchCommand = command({
const passed = allAssertions.filter((a) => a.passed).length;
const failed = allAssertions.filter((a) => !a.passed).length;
const passRate =
allAssertions.length > 0 ? Math.round((passed / allAssertions.length) * 1000) / 1000 : 0;
allAssertions.length > 0
? Math.round((passed / allAssertions.length) * 1000) / 1000
: weightedScore >= 0.5
? 1.0
: 0.0;

allPassRates.push(passRate);

Expand Down Expand Up @@ -238,14 +226,6 @@ export const evalBenchCommand = command({
},
});

/**
 * Consumes standard input until it ends and returns everything that was
 * received as a single UTF-8 string with surrounding whitespace trimmed.
 */
async function readStdin(): Promise<string> {
  const received: Buffer[] = [];
  for await (const piece of process.stdin) {
    received.push(piece);
  }
  // Join all pieces before decoding, then trim the decoded text.
  const combined = Buffer.concat(received);
  const text = combined.toString('utf8');
  return text.trim();
}

function computeStats(values: readonly number[]): { mean: number; stddev: number } {
if (values.length === 0) return { mean: 0, stddev: 0 };
const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
Expand Down
10 changes: 9 additions & 1 deletion apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,15 @@ export async function runCodeGraders(
);
const parsed = JSON.parse(stdout);
const score = typeof parsed.score === 'number' ? parsed.score : 0;
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
// TODO: Remove hits/misses fallback once all grader scripts emit assertions natively.
// The hits/misses format is deprecated; graders should output { assertions: [...] } directly.
const assertions: { text: string; passed: boolean }[] =
Array.isArray(parsed.assertions) && parsed.assertions.length > 0
? parsed.assertions
: [
...(parsed.hits ?? []).map((h: string) => ({ text: h, passed: true })),
...(parsed.misses ?? []).map((m: string) => ({ text: m, passed: false })),
];

const result = {
name: graderName,
Expand Down
55 changes: 29 additions & 26 deletions apps/cli/test/commands/eval/pipeline/bench.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ describe('pipeline bench', () => {
const testDir = join(OUT_DIR, 'test-01');
const codeResultsDir = join(testDir, 'code_grader_results');
const llmGradersDir = join(testDir, 'llm_graders');
const llmResultsDir = join(testDir, 'llm_grader_results');
const codeGradersDir = join(testDir, 'code_graders');
await mkdir(codeResultsDir, { recursive: true });
await mkdir(llmGradersDir, { recursive: true });
await mkdir(llmResultsDir, { recursive: true });
await mkdir(codeGradersDir, { recursive: true });

await writeFile(
Expand Down Expand Up @@ -58,17 +60,17 @@ describe('pipeline bench', () => {
});

it('writes grading.json with merged scores and pass_rate', async () => {
const llmScores = JSON.stringify({
'test-01': {
relevance: {
score: 0.8,
assertions: [{ text: 'Relevant response', passed: true, evidence: 'matches criteria' }],
},
},
});
// Write LLM grader result to disk (the default flow)
await writeFile(
join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
JSON.stringify({
score: 0.8,
assertions: [{ text: 'Relevant response', passed: true, evidence: 'matches criteria' }],
}),
);

const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

const grading = JSON.parse(await readFile(join(OUT_DIR, 'test-01', 'grading.json'), 'utf8'));
expect(grading.summary.pass_rate).toBeGreaterThan(0);
Expand All @@ -77,17 +79,16 @@ describe('pipeline bench', () => {
});

it('writes index.jsonl with one entry per test', async () => {
const llmScores = JSON.stringify({
'test-01': {
relevance: {
score: 0.8,
assertions: [{ text: 'Relevant', passed: true }],
},
},
});
await writeFile(
join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
JSON.stringify({
score: 0.8,
assertions: [{ text: 'Relevant', passed: true }],
}),
);

const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
const lines = indexContent
Expand All @@ -100,14 +101,16 @@ describe('pipeline bench', () => {
});

it('writes benchmark.json with run_summary', async () => {
const llmScores = JSON.stringify({
'test-01': {
relevance: { score: 0.8, assertions: [{ text: 'ok', passed: true }] },
},
});
await writeFile(
join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
JSON.stringify({
score: 0.8,
assertions: [{ text: 'ok', passed: true }],
}),
);

const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.metadata.targets).toContain('test-target');
Expand All @@ -128,7 +131,7 @@ describe('pipeline bench', () => {
);

const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' });
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
const entry = JSON.parse(indexContent.trim().split('\n')[0]);
Expand All @@ -140,7 +143,7 @@ describe('pipeline bench', () => {

it('omits experiment from output when manifest has no experiment', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' });
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
const entry = JSON.parse(indexContent.trim().split('\n')[0]);
Expand Down
23 changes: 12 additions & 11 deletions apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { readFile, rm, writeFile } from 'node:fs/promises';
import { mkdir, readFile, rm, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import { afterEach, describe, expect, it } from 'vitest';

Expand Down Expand Up @@ -33,16 +33,17 @@ describe('eval pipeline e2e', () => {
);
expect(gradeResult.score).toBe(1);

// Step 4: pipeline bench with mock LLM scores
const llmScores = JSON.stringify({
'test-01': {
relevance: {
score: 0.9,
assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
},
},
});
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
// Step 4: Write mock LLM grader result to disk, then run pipeline bench
const llmResultsDir = join(OUT_DIR, 'input-test', 'test-01', 'llm_grader_results');
await mkdir(llmResultsDir, { recursive: true });
await writeFile(
join(llmResultsDir, 'relevance.json'),
JSON.stringify({
score: 0.9,
assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
}),
);
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

// Verify final artifacts
const grading = JSON.parse(
Expand Down
16 changes: 12 additions & 4 deletions plugins/agentv-dev/skills/agentv-bench/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,18 +200,26 @@ Each grader subagent (read `agents/grader.md`):
1. Reads `<test-id>/llm_graders/<name>.json` for the grading prompt
2. Reads `<test-id>/response.md` for the candidate output
3. Grades the response against the prompt criteria
4. Returns score (0.0–1.0) and per-assertion evidence
4. **Writes its result to disk**: `<run-dir>/<evalset>/<test-id>/llm_grader_results/<name>.json`
5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator

After **all** grader subagents complete, merge their results into a single `llm_scores.json` in the run directory.
**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/<name>.json` makes grading resumable and assertion evidence durable.

The result file format is:
```json
{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] }
```

After **all** grader subagents complete, run Phase 3 directly.

**Phase 3: Merge and validate**

```bash
agentv pipeline bench <run-dir> --llm-scores llm_scores.json
agentv pipeline bench <run-dir>
agentv results validate <run-dir>
```

This merges code-grader + LLM scores, computes weighted pass_rate, writes `grading.json` + `index.jsonl` + `benchmark.json`.
`pipeline bench` reads LLM grader results from `llm_grader_results/<name>.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`.

### Artifacts

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -313,18 +313,13 @@ Runs code-grader assertions against `response.md` files in each test directory.

### `agentv pipeline bench <export-dir>`

Merges code-grader results with LLM grader scores (read from stdin) and produces final artifacts.
Merges code-grader results with LLM grader scores and produces final artifacts.

**Stdin format (LLM grader scores):**
LLM grader results are read from disk at `<test-id>/llm_grader_results/<name>.json`, one file per grader defined under `<test-id>/llm_graders/`. A grader with no result file is skipped.

**LLM grader result file format** (`llm_grader_results/<name>.json`):
```json
{
"<test-id>": {
"<grader-name>": {
"score": 0.85,
"assertions": [{"text": "...", "passed": true, "evidence": "..."}]
}
}
}
{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] }
```

**Output:**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,10 @@ agentv pipeline input evals/repro.eval.yaml
# Step 3: Run code graders
agentv pipeline grade <run-dir>

# Step 4: Subagent does LLM grading, writes llm_scores.json
# Step 4: Subagent does LLM grading, writes results to llm_grader_results/<name>.json per test

# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard)
agentv pipeline bench <run-dir> --llm-scores llm_scores.json
agentv pipeline bench <run-dir>

# Step 6: Validate
agentv results validate <run-dir>
Expand Down
Loading