feat(pipeline): agent-mode artifacts align with CLI-mode schema (#796)

christso · claude · web-flow · commit 2c778b9ea620 · 2026-03-28T00:48:51.000+11:00
* feat(pipeline): agent-mode artifacts align with CLI-mode schema

- pipeline run/input: --out now optional, defaults to .agentv/results/runs/eval_<timestamp>
- pipeline bench: index.jsonl now includes scores[], execution_status, response_path to match CLI-mode dashboard schema
- results validate: new command to check run dir naming, index.jsonl fields, artifact presence, and score bounds
- skill: update agent-mode workflow docs to use default --out, add validate step, clarify llm_scores.json -> index.jsonl flow; user-stated mode overrides .env

* fix: address code review issues for pipeline artifact alignment

1. execution_status: run.ts now writes status into timing.json ('ok' or
   'execution_error'), bench.ts reads it back instead of hardcoding 'ok'
2. response_path: use null instead of undefined so the field is always
   present in index.jsonl
3. --workers concurrency: implement actual concurrency limiter using
   Promise.race instead of unbounded Promise.all
4. validate.ts: validate scores[] entry structure (name, type, score,
   verdict) and warn on unknown execution_status values

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
@@ -12,6 +12,7 @@
  * Stdin format (LLM scores):
  *   { "<test-id>": { "<grader-name>": { "score": 0.85, "assertions": [...] } } }
  */
+import { existsSync } from 'node:fs';
 import { readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
 
@@ -150,15 +151,46 @@ export const evalBenchCommand = command({
         'utf8',
       );
 
-      // Build index entry
+      // Build index entry (match CLI-mode schema for dashboard compatibility)
+      const scores = evaluators.map((e) => ({
+        name: e.name,
+        type: e.type,
+        score: e.score,
+        weight: e.weight,
+        verdict: e.score >= 0.5 ? 'pass' : 'fail',
+        assertions: e.assertions.map((a) => ({
+          text: a.text,
+          passed: a.passed,
+          evidence: a.evidence ?? '',
+        })),
+      }));
+
+      // Read execution_status from timing.json (written by pipeline run)
+      let executionStatus = 'ok';
+      const timingPath = join(testDir, 'timing.json');
+      if (existsSync(timingPath)) {
+        try {
+          const timing = JSON.parse(await readFile(timingPath, 'utf8'));
+          if (typeof timing.execution_status === 'string') {
+            executionStatus = timing.execution_status;
+          }
+        } catch {
+          // Fall back to 'ok' if timing.json is unreadable
+        }
+      }
+
+      const hasResponse = existsSync(join(testDir, 'response.md'));
       indexLines.push(
         JSON.stringify({
           timestamp: manifest.timestamp,
           test_id: testId,
           score: Math.round(weightedScore * 1000) / 1000,
           target: targetName,
+          scores,
+          execution_status: executionStatus,
           grading_path: `${testId}/grading.json`,
           timing_path: `${testId}/timing.json`,
+          response_path: hasResponse ? `${testId}/response.md` : null,
         }),
       );
     }
diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
@@ -23,8 +23,9 @@ import { dirname, join, resolve } from 'node:path';
 
 import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
 import { loadTestSuite } from '@agentv/core';
-import { command, option, positional, string } from 'cmd-ts';
+import { command, option, optional, positional, string } from 'cmd-ts';
 
+import { buildDefaultRunDir } from '../eval/result-layout.js';
 import { findRepoRoot } from '../eval/shared.js';
 import { selectTarget } from '../eval/targets.js';
 
@@ -38,14 +39,15 @@ export const evalInputCommand = command({
       description: 'Path to eval YAML file',
     }),
     out: option({
-      type: string,
+      type: optional(string),
       long: 'out',
-      description: 'Output directory for extracted inputs',
+      description:
+        'Output directory for extracted inputs (default: .agentv/results/runs/eval_<timestamp>)',
     }),
   },
   handler: async ({ evalPath, out }) => {
     const resolvedEvalPath = resolve(evalPath);
-    const outDir = resolve(out);
+    const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
     const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
     const evalDir = dirname(resolvedEvalPath);
 
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -21,6 +21,7 @@ import { executeScript, loadTestSuite } from '@agentv/core';
 import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
 import { command, number, option, optional, positional, string } from 'cmd-ts';
 
+import { buildDefaultRunDir } from '../eval/result-layout.js';
 import { findRepoRoot } from '../eval/shared.js';
 import { selectTarget } from '../eval/targets.js';
 
@@ -57,9 +58,9 @@ export const evalRunCommand = command({
       description: 'Path to eval YAML file',
     }),
     out: option({
-      type: string,
+      type: optional(string),
       long: 'out',
-      description: 'Output directory for results',
+      description: 'Output directory for results (default: .agentv/results/runs/eval_<timestamp>)',
     }),
     workers: option({
       type: optional(number),
@@ -69,7 +70,7 @@ export const evalRunCommand = command({
   },
   handler: async ({ evalPath, out, workers }) => {
     const resolvedEvalPath = resolve(evalPath);
-    const outDir = resolve(out);
+    const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
     const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
     const evalDir = dirname(resolvedEvalPath);
 
@@ -232,6 +233,7 @@ export const evalRunCommand = command({
           await writeJson(join(testDir, 'timing.json'), {
             duration_ms: durationMs,
             total_duration_seconds: Math.round(durationMs / 10) / 100,
+            execution_status: 'ok',
           });
 
           console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
@@ -243,6 +245,7 @@ export const evalRunCommand = command({
           await writeJson(join(testDir, 'timing.json'), {
             duration_ms: durationMs,
             total_duration_seconds: Math.round(durationMs / 10) / 100,
+            execution_status: 'execution_error',
           });
           console.error(`  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`);
         } finally {
@@ -256,9 +259,18 @@ export const evalRunCommand = command({
         }
       };
 
-      // Run all targets in parallel
-      const allTasks = testIds.map((testId) => invokeTarget(testId));
-      await Promise.all(allTasks);
+      // Run targets with concurrency limit
+      const pending = new Set<Promise<void>>();
+      for (const testId of testIds) {
+        const task = invokeTarget(testId).then(() => {
+          pending.delete(task);
+        });
+        pending.add(task);
+        if (pending.size >= maxWorkers) {
+          await Promise.race(pending);
+        }
+      }
+      await Promise.all(pending);
     } else {
       console.log('Agent-as-target mode — skipping CLI invocation.');
     }
diff --git a/apps/cli/src/commands/results/index.ts b/apps/cli/src/commands/results/index.ts
@@ -4,6 +4,7 @@ import { resultsExportCommand } from './export.js';
 import { resultsFailuresCommand } from './failures.js';
 import { resultsShowCommand } from './show.js';
 import { resultsSummaryCommand } from './summary.js';
+import { resultsValidateCommand } from './validate.js';
 
 export const resultsCommand = subcommands({
   name: 'results',
@@ -13,5 +14,6 @@ export const resultsCommand = subcommands({
     summary: resultsSummaryCommand,
     failures: resultsFailuresCommand,
     show: resultsShowCommand,
+    validate: resultsValidateCommand,
   },
 });
diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts
diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md