Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 155 additions & 79 deletions apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,19 @@
* with the response text on stdin (matching CodeEvaluator payload format),
* and writes results to code_grader_results/<name>.json.
*
 * Graders run concurrently (default: 10 workers) for performance.
* Progress is printed to stderr so users see real-time feedback.
*
* Export directory additions:
* <out-dir>/<eval-set>/<test-id>/code_grader_results/<name>.json
*/
import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';

import { executeScript } from '@agentv/core';
import { command, positional, string } from 'cmd-ts';
import { command, number, option, optional, positional, string } from 'cmd-ts';

const DEFAULT_CONCURRENCY = 10;

/**
* Convert a Message[] array to plain text.
Expand All @@ -26,6 +31,142 @@ function extractInputText(input: Array<{ role: string; content: string }>): stri
return input.map((m) => `@[${m.role}]:\n${m.content}`).join('\n\n');
}

/** Describes a single grader to execute. */
export interface GraderTask {
testId: string;
testDir: string;
resultsDir: string;
graderFile: string;
responseText: string;
inputData: {
input: Array<{ role: string; content: string }>;
input_files?: unknown[];
metadata?: Record<string, unknown>;
};
}

/**
* Run code-grader tasks with concurrency and progress feedback.
* Shared by `pipeline grade` and `pipeline run`.
*/
export async function runCodeGraders(
tasks: GraderTask[],
concurrency: number,
): Promise<{ totalGraders: number; totalPassed: number }> {
let totalGraders = 0;
let totalPassed = 0;
let completed = 0;
const total = tasks.length;

if (total === 0) return { totalGraders: 0, totalPassed: 0 };

const writeProgress = () => {
process.stderr.write(`\rGrading: ${completed}/${total} done`);
};

writeProgress();

const executeGrader = async (task: GraderTask) => {
const { testId, testDir, resultsDir, graderFile, responseText, inputData } = task;
const graderConfig = JSON.parse(
await readFile(join(testDir, 'code_graders', graderFile), 'utf8'),
);
const graderName = graderConfig.name;

const inputText = extractInputText(inputData.input);
const payload = JSON.stringify({
output: [{ role: 'assistant', content: responseText }],
input: inputData.input,
criteria: '',
expected_output: [],
input_files: inputData.input_files ?? [],
trace: null,
token_usage: null,
cost_usd: null,
duration_ms: null,
start_time: null,
end_time: null,
file_changes: null,
workspace_path: null,
config: graderConfig.config ?? null,
metadata: inputData.metadata ?? {},
input_text: inputText,
output_text: responseText,
expected_output_text: '',
});

try {
const stdout = await executeScript(
graderConfig.command,
payload,
undefined,
graderConfig.cwd,
);
const parsed = JSON.parse(stdout);
const score = typeof parsed.score === 'number' ? parsed.score : 0;
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];

const result = {
name: graderName,
type: 'code-grader',
score,
weight: graderConfig.weight ?? 1.0,
assertions,
details: parsed.details ?? {},
};

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(result, null, 2)}\n`,
'utf8',
);

totalGraders++;
if (score >= 0.5) totalPassed++;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
process.stderr.write(`\n ${testId}/${graderName}: ERROR — ${message}\n`);

const errorResult = {
name: graderName,
type: 'code-grader',
score: 0,
weight: graderConfig.weight ?? 1.0,
assertions: [{ text: `Error: ${message}`, passed: false }],
details: { error: message },
};

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(errorResult, null, 2)}\n`,
'utf8',
);
totalGraders++;
} finally {
completed++;
writeProgress();
}
};

// Run with concurrency limit
const pending = new Set<Promise<void>>();
for (const task of tasks) {
const p = executeGrader(task).then(() => {
pending.delete(p);
});
pending.add(p);
if (pending.size >= concurrency) {
await Promise.race(pending);
}
}
await Promise.all(pending);

// Clear the progress line and print final summary
process.stderr.write('\n');

return { totalGraders, totalPassed };
}

export const evalGradeCommand = command({
name: 'grade',
description: 'Run code-grader assertions on responses in an export directory',
Expand All @@ -35,16 +176,23 @@ export const evalGradeCommand = command({
displayName: 'export-dir',
description: 'Export directory from pipeline input',
}),
concurrency: option({
type: optional(number),
long: 'concurrency',
short: 'j',
description: `Number of graders to run in parallel (default: ${DEFAULT_CONCURRENCY})`,
}),
},
handler: async ({ exportDir }) => {
handler: async ({ exportDir, concurrency }) => {
const maxWorkers = concurrency ?? DEFAULT_CONCURRENCY;
const manifestPath = join(exportDir, 'manifest.json');
const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
const testIds: string[] = manifest.test_ids;
const evalSet: string = manifest.dataset ?? '';
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';

let totalGraders = 0;
let totalPassed = 0;
// Collect all grader tasks upfront so we know the total count
const tasks: GraderTask[] = [];

for (const testId of testIds) {
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
Expand All @@ -62,88 +210,16 @@ export const evalGradeCommand = command({
if (graderFiles.length === 0) continue;
await mkdir(resultsDir, { recursive: true });

// Read response and input for stdin payload
// Read response and input once per test (shared by all graders for this test)
const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));

for (const graderFile of graderFiles) {
const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8'));
const graderName = graderConfig.name;

// Build stdin payload matching CodeEvaluator format (snake_case)
const inputText = extractInputText(inputData.input);
const payload = JSON.stringify({
output: [{ role: 'assistant', content: responseText }],
input: inputData.input,
criteria: '',
expected_output: [],
input_files: inputData.input_files ?? [],
trace: null,
token_usage: null,
cost_usd: null,
duration_ms: null,
start_time: null,
end_time: null,
file_changes: null,
workspace_path: null,
config: graderConfig.config ?? null,
metadata: inputData.metadata ?? {},
input_text: inputText,
output_text: responseText,
expected_output_text: '',
});

try {
const stdout = await executeScript(
graderConfig.command,
payload,
undefined,
graderConfig.cwd,
);
const parsed = JSON.parse(stdout);
const score = typeof parsed.score === 'number' ? parsed.score : 0;
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];

const result = {
name: graderName,
type: 'code-grader',
score,
weight: graderConfig.weight ?? 1.0,
assertions,
details: parsed.details ?? {},
};

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(result, null, 2)}\n`,
'utf8',
);

totalGraders++;
if (score >= 0.5) totalPassed++;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.error(` ${testId}/${graderName}: ERROR — ${message}`);

const errorResult = {
name: graderName,
type: 'code-grader',
score: 0,
weight: graderConfig.weight ?? 1.0,
assertions: [{ text: `Error: ${message}`, passed: false }],
details: { error: message },
};

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(errorResult, null, 2)}\n`,
'utf8',
);
totalGraders++;
}
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
}
}

const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers);
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
},
});
86 changes: 8 additions & 78 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@ import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join, relative, resolve } from 'node:path';

import { deriveCategory, executeScript, loadTestSuite } from '@agentv/core';
import { deriveCategory, loadTestSuite } from '@agentv/core';
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts';

import { buildDefaultRunDir } from '../eval/result-layout.js';
import { findRepoRoot } from '../eval/shared.js';
import { selectTarget } from '../eval/targets.js';
import type { GraderTask } from './grade.js';
import { runCodeGraders } from './grade.js';

/**
* Convert a Message[] array to plain text.
Expand Down Expand Up @@ -313,8 +315,8 @@ export const evalRunCommand = command({
return;
}

let totalGraders = 0;
let totalPassed = 0;
// Collect grader tasks and run concurrently with progress feedback
const graderTasks: GraderTask[] = [];

for (const testId of testIds) {
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
Expand All @@ -335,84 +337,12 @@ export const evalRunCommand = command({
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));

for (const graderFile of graderFiles) {
const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8'));
const graderName = graderConfig.name;

const inputText = extractInputText(inputData.input);
const payload = JSON.stringify({
output: [{ role: 'assistant', content: responseText }],
input: inputData.input,
criteria: '',
expected_output: [],
input_files: inputData.input_files ?? [],
trace: null,
token_usage: null,
cost_usd: null,
duration_ms: null,
start_time: null,
end_time: null,
file_changes: null,
workspace_path: null,
config: graderConfig.config ?? null,
metadata: inputData.metadata ?? {},
input_text: inputText,
output_text: responseText,
expected_output_text: '',
});

try {
const stdout = await executeScript(
graderConfig.command,
payload,
undefined,
graderConfig.cwd,
);
const parsed = JSON.parse(stdout);
const score = typeof parsed.score === 'number' ? parsed.score : 0;
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(
{
name: graderName,
type: 'code-grader',
score,
weight: graderConfig.weight ?? 1.0,
assertions,
details: parsed.details ?? {},
},
null,
2,
)}\n`,
'utf8',
);
totalGraders++;
if (score >= 0.5) totalPassed++;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.error(` ${testId}/${graderName}: ERROR — ${message}`);
await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(
{
name: graderName,
type: 'code-grader',
score: 0,
weight: graderConfig.weight ?? 1.0,
assertions: [{ text: `Error: ${message}`, passed: false }],
details: { error: message },
},
null,
2,
)}\n`,
'utf8',
);
totalGraders++;
}
graderTasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
}
}

const graderConcurrency = workers ?? 10;
const { totalGraders, totalPassed } = await runCodeGraders(graderTasks, graderConcurrency);
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
console.log(`\nDone. Agent can now perform LLM grading on responses in ${outDir}`);
},
Expand Down
Loading