Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Stdin format (LLM scores):
* { "<test-id>": { "<grader-name>": { "score": 0.85, "assertions": [...] } } }
*/
import { existsSync } from 'node:fs';
import { readFile, readdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';

Expand Down Expand Up @@ -150,15 +151,46 @@ export const evalBenchCommand = command({
'utf8',
);

// Build index entry
// Build index entry (match CLI-mode schema for dashboard compatibility)
const scores = evaluators.map((e) => ({
name: e.name,
type: e.type,
score: e.score,
weight: e.weight,
verdict: e.score >= 0.5 ? 'pass' : 'fail',
assertions: e.assertions.map((a) => ({
text: a.text,
passed: a.passed,
evidence: a.evidence ?? '',
})),
}));

// Read execution_status from timing.json (written by pipeline run); stays 'ok' when the file is missing or has no string execution_status
let executionStatus = 'ok';
const timingPath = join(testDir, 'timing.json');
if (existsSync(timingPath)) {
try {
const timing = JSON.parse(await readFile(timingPath, 'utf8'));
if (typeof timing.execution_status === 'string') {
executionStatus = timing.execution_status;
}
} catch {
// Fall back to 'ok' if timing.json cannot be read or is not valid JSON
}
}

const hasResponse = existsSync(join(testDir, 'response.md'));
indexLines.push(
JSON.stringify({
timestamp: manifest.timestamp,
test_id: testId,
score: Math.round(weightedScore * 1000) / 1000,
target: targetName,
scores,
execution_status: executionStatus,
grading_path: `${testId}/grading.json`,
timing_path: `${testId}/timing.json`,
response_path: hasResponse ? `${testId}/response.md` : null,
}),
);
}
Expand Down
10 changes: 6 additions & 4 deletions apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ import { dirname, join, resolve } from 'node:path';

import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
import { loadTestSuite } from '@agentv/core';
import { command, option, positional, string } from 'cmd-ts';
import { command, option, optional, positional, string } from 'cmd-ts';

import { buildDefaultRunDir } from '../eval/result-layout.js';
import { findRepoRoot } from '../eval/shared.js';
import { selectTarget } from '../eval/targets.js';

Expand All @@ -38,14 +39,15 @@ export const evalInputCommand = command({
description: 'Path to eval YAML file',
}),
out: option({
type: string,
type: optional(string),
long: 'out',
description: 'Output directory for extracted inputs',
description:
'Output directory for extracted inputs (default: .agentv/results/runs/eval_<timestamp>)',
}),
},
handler: async ({ evalPath, out }) => {
const resolvedEvalPath = resolve(evalPath);
const outDir = resolve(out);
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
const evalDir = dirname(resolvedEvalPath);

Expand Down
24 changes: 18 additions & 6 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import { executeScript, loadTestSuite } from '@agentv/core';
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
import { command, number, option, optional, positional, string } from 'cmd-ts';

import { buildDefaultRunDir } from '../eval/result-layout.js';
import { findRepoRoot } from '../eval/shared.js';
import { selectTarget } from '../eval/targets.js';

Expand Down Expand Up @@ -57,9 +58,9 @@ export const evalRunCommand = command({
description: 'Path to eval YAML file',
}),
out: option({
type: string,
type: optional(string),
long: 'out',
description: 'Output directory for results',
description: 'Output directory for results (default: .agentv/results/runs/eval_<timestamp>)',
}),
workers: option({
type: optional(number),
Expand All @@ -69,7 +70,7 @@ export const evalRunCommand = command({
},
handler: async ({ evalPath, out, workers }) => {
const resolvedEvalPath = resolve(evalPath);
const outDir = resolve(out);
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
const evalDir = dirname(resolvedEvalPath);

Expand Down Expand Up @@ -232,6 +233,7 @@ export const evalRunCommand = command({
await writeJson(join(testDir, 'timing.json'), {
duration_ms: durationMs,
total_duration_seconds: Math.round(durationMs / 10) / 100,
execution_status: 'ok',
});

console.log(` ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
Expand All @@ -243,6 +245,7 @@ export const evalRunCommand = command({
await writeJson(join(testDir, 'timing.json'), {
duration_ms: durationMs,
total_duration_seconds: Math.round(durationMs / 10) / 100,
execution_status: 'execution_error',
});
console.error(` ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`);
} finally {
Expand All @@ -256,9 +259,18 @@ export const evalRunCommand = command({
}
};

// Run all targets in parallel
const allTasks = testIds.map((testId) => invokeTarget(testId));
await Promise.all(allTasks);
// Run targets with concurrency limit
const pending = new Set<Promise<void>>();
for (const testId of testIds) {
const task = invokeTarget(testId).then(() => {
pending.delete(task);
});
pending.add(task);
if (pending.size >= maxWorkers) {
await Promise.race(pending);
}
}
await Promise.all(pending);
} else {
console.log('Agent-as-target mode — skipping CLI invocation.');
}
Expand Down
2 changes: 2 additions & 0 deletions apps/cli/src/commands/results/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { resultsExportCommand } from './export.js';
import { resultsFailuresCommand } from './failures.js';
import { resultsShowCommand } from './show.js';
import { resultsSummaryCommand } from './summary.js';
import { resultsValidateCommand } from './validate.js';

export const resultsCommand = subcommands({
name: 'results',
Expand All @@ -13,5 +14,6 @@ export const resultsCommand = subcommands({
summary: resultsSummaryCommand,
failures: resultsFailuresCommand,
show: resultsShowCommand,
validate: resultsValidateCommand,
},
});
Loading
Loading