Skip to content

Commit 2c778b9

Browse files
christso and claude authored
feat(pipeline): agent-mode artifacts align with CLI-mode schema (#796)
* feat(pipeline): agent-mode artifacts align with CLI-mode schema

  - pipeline run/input: --out now optional, defaults to .agentv/results/runs/eval_<timestamp>
  - pipeline bench: index.jsonl now includes scores[], execution_status, response_path to match CLI-mode dashboard schema
  - results validate: new command to check run dir naming, index.jsonl fields, artifact presence, and score bounds
  - skill: update agent-mode workflow docs to use default --out, add validate step, clarify llm_scores.json -> index.jsonl flow; user-stated mode overrides .env

* fix: address code review issues for pipeline artifact alignment

  1. execution_status: run.ts now writes status into timing.json ('ok' or 'execution_error'), bench.ts reads it back instead of hardcoding 'ok'
  2. response_path: use null instead of undefined so the field is always present in index.jsonl
  3. --workers concurrency: implement actual concurrency limiter using Promise.race instead of unbounded Promise.all
  4. validate.ts: validate scores[] entry structure (name, type, score, verdict) and warn on unknown execution_status values

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4707612 commit 2c778b9

File tree

6 files changed

+378
-33
lines changed

6 files changed

+378
-33
lines changed

apps/cli/src/commands/pipeline/bench.ts

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
* Stdin format (LLM scores):
1313
* { "<test-id>": { "<grader-name>": { "score": 0.85, "assertions": [...] } } }
1414
*/
15+
import { existsSync } from 'node:fs';
1516
import { readFile, readdir, writeFile } from 'node:fs/promises';
1617
import { join } from 'node:path';
1718

@@ -150,15 +151,46 @@ export const evalBenchCommand = command({
150151
'utf8',
151152
);
152153

153-
// Build index entry
154+
// Build index entry (match CLI-mode schema for dashboard compatibility)
155+
const scores = evaluators.map((e) => ({
156+
name: e.name,
157+
type: e.type,
158+
score: e.score,
159+
weight: e.weight,
160+
verdict: e.score >= 0.5 ? 'pass' : 'fail',
161+
assertions: e.assertions.map((a) => ({
162+
text: a.text,
163+
passed: a.passed,
164+
evidence: a.evidence ?? '',
165+
})),
166+
}));
167+
168+
// Read execution_status from timing.json (written by pipeline run)
169+
let executionStatus = 'ok';
170+
const timingPath = join(testDir, 'timing.json');
171+
if (existsSync(timingPath)) {
172+
try {
173+
const timing = JSON.parse(await readFile(timingPath, 'utf8'));
174+
if (typeof timing.execution_status === 'string') {
175+
executionStatus = timing.execution_status;
176+
}
177+
} catch {
178+
// Fall back to 'ok' if timing.json is unreadable
179+
}
180+
}
181+
182+
const hasResponse = existsSync(join(testDir, 'response.md'));
154183
indexLines.push(
155184
JSON.stringify({
156185
timestamp: manifest.timestamp,
157186
test_id: testId,
158187
score: Math.round(weightedScore * 1000) / 1000,
159188
target: targetName,
189+
scores,
190+
execution_status: executionStatus,
160191
grading_path: `${testId}/grading.json`,
161192
timing_path: `${testId}/timing.json`,
193+
response_path: hasResponse ? `${testId}/response.md` : null,
162194
}),
163195
);
164196
}

apps/cli/src/commands/pipeline/input.ts

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,9 @@ import { dirname, join, resolve } from 'node:path';
2323

2424
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
2525
import { loadTestSuite } from '@agentv/core';
26-
import { command, option, positional, string } from 'cmd-ts';
26+
import { command, option, optional, positional, string } from 'cmd-ts';
2727

28+
import { buildDefaultRunDir } from '../eval/result-layout.js';
2829
import { findRepoRoot } from '../eval/shared.js';
2930
import { selectTarget } from '../eval/targets.js';
3031

@@ -38,14 +39,15 @@ export const evalInputCommand = command({
3839
description: 'Path to eval YAML file',
3940
}),
4041
out: option({
41-
type: string,
42+
type: optional(string),
4243
long: 'out',
43-
description: 'Output directory for extracted inputs',
44+
description:
45+
'Output directory for extracted inputs (default: .agentv/results/runs/eval_<timestamp>)',
4446
}),
4547
},
4648
handler: async ({ evalPath, out }) => {
4749
const resolvedEvalPath = resolve(evalPath);
48-
const outDir = resolve(out);
50+
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
4951
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
5052
const evalDir = dirname(resolvedEvalPath);
5153

apps/cli/src/commands/pipeline/run.ts

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ import { executeScript, loadTestSuite } from '@agentv/core';
2121
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
2222
import { command, number, option, optional, positional, string } from 'cmd-ts';
2323

24+
import { buildDefaultRunDir } from '../eval/result-layout.js';
2425
import { findRepoRoot } from '../eval/shared.js';
2526
import { selectTarget } from '../eval/targets.js';
2627

@@ -57,9 +58,9 @@ export const evalRunCommand = command({
5758
description: 'Path to eval YAML file',
5859
}),
5960
out: option({
60-
type: string,
61+
type: optional(string),
6162
long: 'out',
62-
description: 'Output directory for results',
63+
description: 'Output directory for results (default: .agentv/results/runs/eval_<timestamp>)',
6364
}),
6465
workers: option({
6566
type: optional(number),
@@ -69,7 +70,7 @@ export const evalRunCommand = command({
6970
},
7071
handler: async ({ evalPath, out, workers }) => {
7172
const resolvedEvalPath = resolve(evalPath);
72-
const outDir = resolve(out);
73+
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
7374
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
7475
const evalDir = dirname(resolvedEvalPath);
7576

@@ -232,6 +233,7 @@ export const evalRunCommand = command({
232233
await writeJson(join(testDir, 'timing.json'), {
233234
duration_ms: durationMs,
234235
total_duration_seconds: Math.round(durationMs / 10) / 100,
236+
execution_status: 'ok',
235237
});
236238

237239
console.log(` ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
@@ -243,6 +245,7 @@ export const evalRunCommand = command({
243245
await writeJson(join(testDir, 'timing.json'), {
244246
duration_ms: durationMs,
245247
total_duration_seconds: Math.round(durationMs / 10) / 100,
248+
execution_status: 'execution_error',
246249
});
247250
console.error(` ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`);
248251
} finally {
@@ -256,9 +259,18 @@ export const evalRunCommand = command({
256259
}
257260
};
258261

259-
// Run all targets in parallel
260-
const allTasks = testIds.map((testId) => invokeTarget(testId));
261-
await Promise.all(allTasks);
262+
// Run targets with concurrency limit
263+
const pending = new Set<Promise<void>>();
264+
for (const testId of testIds) {
265+
const task = invokeTarget(testId).then(() => {
266+
pending.delete(task);
267+
});
268+
pending.add(task);
269+
if (pending.size >= maxWorkers) {
270+
await Promise.race(pending);
271+
}
272+
}
273+
await Promise.all(pending);
262274
} else {
263275
console.log('Agent-as-target mode — skipping CLI invocation.');
264276
}

apps/cli/src/commands/results/index.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import { resultsExportCommand } from './export.js';
44
import { resultsFailuresCommand } from './failures.js';
55
import { resultsShowCommand } from './show.js';
66
import { resultsSummaryCommand } from './summary.js';
7+
import { resultsValidateCommand } from './validate.js';
78

89
export const resultsCommand = subcommands({
910
name: 'results',
@@ -13,5 +14,6 @@ export const resultsCommand = subcommands({
1314
summary: resultsSummaryCommand,
1415
failures: resultsFailuresCommand,
1516
show: resultsShowCommand,
17+
validate: resultsValidateCommand,
1618
},
1719
});

0 commit comments

Comments
 (0)