EntityProcess · christso · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
@@ -2,21 +2,18 @@
  * `agentv pipeline bench` — Merge code-grader and LLM grader scores into final
  * benchmark artifacts.
  *
- * Reads code_grader_results from disk and LLM grader scores from a file
- * (`--llm-scores <path>`) or stdin, computes weighted pass_rate per test,
- * and writes:
+ * Reads code_grader_results and llm_grader_results from disk per test.
+ *
+ * Writes:
  *   - <test-id>/grading.json  (per-test grading breakdown)
  *   - index.jsonl             (one line per test)
  *   - benchmark.json          (aggregate statistics)
- *
- * Stdin format (LLM scores):
- *   { "<test-id>": { "<grader-name>": { "score": 0.85, "assertions": [...] } } }
  */
 import { existsSync } from 'node:fs';
 import { readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
 
-import { command, option, optional, positional, string } from 'cmd-ts';
+import { command, positional, string } from 'cmd-ts';
 
 interface EvaluatorScore {
   readonly name: string;
@@ -35,35 +32,15 @@ export const evalBenchCommand = command({
       displayName: 'export-dir',
       description: 'Export directory from pipeline input/grade',
     }),
-    llmScores: option({
-      type: optional(string),
-      long: 'llm-scores',
-      description: 'Path to LLM scores JSON file (reads from stdin if omitted)',
-    }),
   },
-  handler: async ({ exportDir, llmScores: llmScoresPath }) => {
+  handler: async ({ exportDir }) => {
     const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
     const testIds: string[] = manifest.test_ids;
     const targetName: string = manifest.target?.name ?? 'unknown';
     const evalSet: string = manifest.dataset ?? '';
     const experiment: string | undefined = manifest.experiment;
     const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
-    // Read LLM scores from file or stdin
-    let stdinData: string;
-    if (llmScoresPath) {
-      stdinData = await readFile(llmScoresPath, 'utf8');
-    } else {
-      stdinData = await readStdin();
-    }
-    const llmScores: Record<
-      string,
-      Record<
-        string,
-        { score: number; assertions: { text: string; passed: boolean; evidence?: string }[] }
-      >
-    > = stdinData ? JSON.parse(stdinData) : {};
-
     const indexLines: string[] = [];
     const allPassRates: number[] = [];
 
@@ -95,16 +72,23 @@ export const evalBenchCommand = command({
         // No code grader results
       }
 
-      // Collect LLM grader scores (from stdin data)
-      const testLlmScores = llmScores[testId] ?? {};
-      // Read LLM grader metadata for weights
+      // Collect LLM grader scores from per-test disk results
       const llmGradersDir = join(testDir, 'llm_graders');
       try {
         const graderFiles = (await readdir(llmGradersDir)).filter((f) => f.endsWith('.json'));
         for (const file of graderFiles) {
           const graderMeta = JSON.parse(await readFile(join(llmGradersDir, file), 'utf8'));
           const graderName = graderMeta.name;
-          const llmResult = testLlmScores[graderName];
+
+          const diskResultPath = join(testDir, 'llm_grader_results', `${graderName}.json`);
+          let llmResult:
+            | { score: number; assertions?: { text: string; passed: boolean; evidence?: string }[] }
+            | undefined;
+          try {
+            llmResult = JSON.parse(await readFile(diskResultPath, 'utf8'));
+          } catch {
+            // Result file is missing, unreadable, or not valid JSON — llmResult stays undefined
+          }
 
           if (llmResult) {
             evaluators.push({
@@ -133,7 +117,11 @@ export const evalBenchCommand = command({
       const passed = allAssertions.filter((a) => a.passed).length;
       const failed = allAssertions.filter((a) => !a.passed).length;
       const passRate =
-        allAssertions.length > 0 ? Math.round((passed / allAssertions.length) * 1000) / 1000 : 0;
+        allAssertions.length > 0
+          ? Math.round((passed / allAssertions.length) * 1000) / 1000
+          : weightedScore >= 0.5
+            ? 1.0
+            : 0.0;
 
       allPassRates.push(passRate);
 
@@ -238,14 +226,6 @@ export const evalBenchCommand = command({
   },
 });
 
-async function readStdin(): Promise<string> {
-  const chunks: Buffer[] = [];
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk);
-  }
-  return Buffer.concat(chunks).toString('utf8').trim();
-}
-
 function computeStats(values: readonly number[]): { mean: number; stddev: number } {
   if (values.length === 0) return { mean: 0, stddev: 0 };
   const mean = values.reduce((sum, v) => sum + v, 0) / values.length;

diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
@@ -104,7 +104,15 @@ export async function runCodeGraders(
       );
       const parsed = JSON.parse(stdout);
       const score = typeof parsed.score === 'number' ? parsed.score : 0;
-      const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
+      // TODO: Remove hits/misses fallback once all grader scripts emit assertions natively.
+      // The hits/misses format is deprecated; graders should output { assertions: [...] } directly.
+      const assertions: { text: string; passed: boolean }[] =
+        Array.isArray(parsed.assertions) && parsed.assertions.length > 0
+          ? parsed.assertions
+          : [
+              ...(parsed.hits ?? []).map((h: string) => ({ text: h, passed: true })),
+              ...(parsed.misses ?? []).map((m: string) => ({ text: m, passed: false })),
+            ];
 
       const result = {
         name: graderName,

diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts
@@ -10,9 +10,11 @@ describe('pipeline bench', () => {
     const testDir = join(OUT_DIR, 'test-01');
     const codeResultsDir = join(testDir, 'code_grader_results');
     const llmGradersDir = join(testDir, 'llm_graders');
+    const llmResultsDir = join(testDir, 'llm_grader_results');
     const codeGradersDir = join(testDir, 'code_graders');
     await mkdir(codeResultsDir, { recursive: true });
     await mkdir(llmGradersDir, { recursive: true });
+    await mkdir(llmResultsDir, { recursive: true });
     await mkdir(codeGradersDir, { recursive: true });
 
     await writeFile(
@@ -58,17 +60,17 @@ describe('pipeline bench', () => {
   });
 
   it('writes grading.json with merged scores and pass_rate', async () => {
-    const llmScores = JSON.stringify({
-      'test-01': {
-        relevance: {
-          score: 0.8,
-          assertions: [{ text: 'Relevant response', passed: true, evidence: 'matches criteria' }],
-        },
-      },
-    });
+    // Write LLM grader result to disk (the default flow)
+    await writeFile(
+      join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
+      JSON.stringify({
+        score: 0.8,
+        assertions: [{ text: 'Relevant response', passed: true, evidence: 'matches criteria' }],
+      }),
+    );
 
     const { execa } = await import('execa');
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
     const grading = JSON.parse(await readFile(join(OUT_DIR, 'test-01', 'grading.json'), 'utf8'));
     expect(grading.summary.pass_rate).toBeGreaterThan(0);
@@ -77,17 +79,16 @@ describe('pipeline bench', () => {
   });
 
   it('writes index.jsonl with one entry per test', async () => {
-    const llmScores = JSON.stringify({
-      'test-01': {
-        relevance: {
-          score: 0.8,
-          assertions: [{ text: 'Relevant', passed: true }],
-        },
-      },
-    });
+    await writeFile(
+      join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
+      JSON.stringify({
+        score: 0.8,
+        assertions: [{ text: 'Relevant', passed: true }],
+      }),
+    );
 
     const { execa } = await import('execa');
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
     const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
     const lines = indexContent
@@ -100,14 +101,16 @@ describe('pipeline bench', () => {
   });
 
   it('writes benchmark.json with run_summary', async () => {
-    const llmScores = JSON.stringify({
-      'test-01': {
-        relevance: { score: 0.8, assertions: [{ text: 'ok', passed: true }] },
-      },
-    });
+    await writeFile(
+      join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
+      JSON.stringify({
+        score: 0.8,
+        assertions: [{ text: 'ok', passed: true }],
+      }),
+    );
 
     const { execa } = await import('execa');
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
     const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
     expect(benchmark.metadata.targets).toContain('test-target');
@@ -128,7 +131,7 @@ describe('pipeline bench', () => {
     );
 
     const { execa } = await import('execa');
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' });
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
     const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
     const entry = JSON.parse(indexContent.trim().split('\n')[0]);
@@ -140,7 +143,7 @@ describe('pipeline bench', () => {
 
   it('omits experiment from output when manifest has no experiment', async () => {
     const { execa } = await import('execa');
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' });
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
     const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
     const entry = JSON.parse(indexContent.trim().split('\n')[0]);

diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
@@ -1,4 +1,4 @@
-import { readFile, rm, writeFile } from 'node:fs/promises';
+import { mkdir, readFile, rm, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
 import { afterEach, describe, expect, it } from 'vitest';
 
@@ -33,16 +33,17 @@ describe('eval pipeline e2e', () => {
     );
     expect(gradeResult.score).toBe(1);
 
-    // Step 4: pipeline bench with mock LLM scores
-    const llmScores = JSON.stringify({
-      'test-01': {
-        relevance: {
-          score: 0.9,
-          assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
-        },
-      },
-    });
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: llmScores });
+    // Step 4: Write mock LLM grader result to disk, then run pipeline bench
+    const llmResultsDir = join(OUT_DIR, 'input-test', 'test-01', 'llm_grader_results');
+    await mkdir(llmResultsDir, { recursive: true });
+    await writeFile(
+      join(llmResultsDir, 'relevance.json'),
+      JSON.stringify({
+        score: 0.9,
+        assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
+      }),
+    );
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
     // Verify final artifacts
     const grading = JSON.parse(

diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
@@ -200,18 +200,26 @@ Each grader subagent (read `agents/grader.md`):
 1. Reads `<test-id>/llm_graders/<name>.json` for the grading prompt
 2. Reads `<test-id>/response.md` for the candidate output
 3. Grades the response against the prompt criteria
-4. Returns score (0.0–1.0) and per-assertion evidence
+4. **Writes its result to disk**: `<run-dir>/<evalset>/<test-id>/llm_grader_results/<name>.json`
+5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator
 
-After **all** grader subagents complete, merge their results into a single `llm_scores.json` in the run directory.
+**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/<name>.json` makes grading resumable and assertion evidence durable.
+
+The result file format is:
+```json
+{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] }
+```
+
+After **all** grader subagents complete, run Phase 3 directly.
 
 **Phase 3: Merge and validate**
 
 ```bash
-agentv pipeline bench <run-dir> --llm-scores llm_scores.json
+agentv pipeline bench <run-dir>
 agentv results validate <run-dir>
 ```
 
-This merges code-grader + LLM scores, computes weighted pass_rate, writes `grading.json` + `index.jsonl` + `benchmark.json`.
+`pipeline bench` automatically reads each test's LLM grader results from `llm_grader_results/<name>.json`, merges them with the code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`.
 
 ### Artifacts
 

diff --git a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md
@@ -313,18 +313,13 @@ Runs code-grader assertions against `response.md` files in each test directory.
 
 ### `agentv pipeline bench <export-dir>`
 
-Merges code-grader results with LLM grader scores (read from stdin) and produces final artifacts.
+Merges code-grader results with LLM grader scores and produces final artifacts.
 
-**Stdin format (LLM grader scores):**
+LLM grader results are read from disk at `<test-id>/llm_grader_results/<name>.json` per test.
+
+**LLM grader result file format** (`llm_grader_results/<name>.json`):
 ```json
-{
-  "<test-id>": {
-    "<grader-name>": {
-      "score": 0.85,
-      "assertions": [{"text": "...", "passed": true, "evidence": "..."}]
-    }
-  }
-}
+{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] }
 ```
 
 **Output:**

diff --git a/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md b/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md
@@ -109,10 +109,10 @@ agentv pipeline input evals/repro.eval.yaml
 # Step 3: Run code graders
 agentv pipeline grade <run-dir>
 
-# Step 4: Subagent does LLM grading, writes llm_scores.json
+# Step 4: Subagent does LLM grading and writes each result to <test-id>/llm_grader_results/<name>.json
 
 # Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard)
-agentv pipeline bench <run-dir> --llm-scores llm_scores.json
+agentv pipeline bench <run-dir>
 
 # Step 6: Validate
 agentv results validate <run-dir>