diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 59c65d93..be062a4c 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -11,7 +11,7 @@ * * To add new features: extend the handler — all logic is self-contained. */ -import { execSync } from 'node:child_process'; +import { exec } from 'node:child_process'; import { existsSync, readFileSync, unlinkSync } from 'node:fs'; import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; @@ -78,7 +78,7 @@ export const evalRunCommand = command({ workers: option({ type: optional(number), long: 'workers', - description: 'Parallel workers for target invocation (default: all tests)', + description: 'Parallel workers for target invocation (default: targets.yaml workers, then 5)', }), experiment: option({ type: optional(string), @@ -116,6 +116,7 @@ export const evalRunCommand = command({ } | null = null; let targetName = 'agent'; let targetKind = 'agent'; + let targetWorkers: number | undefined; try { const selection = await selectTarget({ @@ -129,6 +130,7 @@ export const evalRunCommand = command({ env: process.env, }); targetName = selection.targetName; + targetWorkers = selection.resolvedTarget.workers; if (selection.resolvedTarget.kind === 'cli') { targetKind = 'cli'; const config = selection.resolvedTarget.config; @@ -216,9 +218,16 @@ export const evalRunCommand = command({ .replace(/\./g, '-'); } const mergedEnv = { ...process.env, ...envVars }; - const maxWorkers = workers ?? testIds.length; + const maxWorkers = workers ?? targetWorkers ?? 
5; + let invCompleted = 0; + const invTotal = testIds.length; - console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`); + const writeInvProgress = () => { + process.stderr.write(`\rInvoking: ${invCompleted}/${invTotal} done`); + }; + + console.log(`Invoking ${invTotal} CLI target(s) (${maxWorkers} workers)...`); + writeInvProgress(); const invokeTarget = async (testId: string): Promise<void> => { const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId]; @@ -244,12 +253,20 @@ const start = performance.now(); try { - execSync(rendered, { - cwd, - timeout: timeoutMs, - env: mergedEnv, - stdio: ['pipe', 'pipe', 'pipe'], - maxBuffer: 10 * 1024 * 1024, + await new Promise<void>((resolveP, rejectP) => { + exec( + rendered, + { + cwd, + timeout: timeoutMs, + env: mergedEnv, + maxBuffer: 10 * 1024 * 1024, + }, + (error) => { + if (error) rejectP(error); + else resolveP(); + }, + ); }); const durationMs = Math.round(performance.now() - start); @@ -267,7 +284,7 @@ execution_status: 'ok', }); - console.log(` ${testId}: OK (${durationMs}ms, ${response.length} chars)`); + process.stderr.write(`\n ${testId}: OK (${durationMs}ms, ${response.length} chars)\n`); } catch (error) { const durationMs = Math.round(performance.now() - start); const message = error instanceof Error ? 
error.message : String(error); @@ -278,8 +295,12 @@ export const evalRunCommand = command({ total_duration_seconds: Math.round(durationMs / 10) / 100, execution_status: 'execution_error', }); - console.error(` ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`); + process.stderr.write( + `\n ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`, + ); } finally { + invCompleted++; + writeInvProgress(); // Cleanup temp files try { if (existsSync(promptFile)) unlinkSync(promptFile); @@ -302,6 +323,7 @@ export const evalRunCommand = command({ } } await Promise.all(pending); + process.stderr.write('\n'); } else { console.log('Subagent-as-target mode — skipping CLI invocation.'); } diff --git a/packages/core/src/evaluation/category.ts b/packages/core/src/evaluation/category.ts index 926e34f8..7f4a39e5 100644 --- a/packages/core/src/evaluation/category.ts +++ b/packages/core/src/evaluation/category.ts @@ -1,5 +1,3 @@ -import path from 'node:path'; - /** Default category for eval files without subdirectory structure. */ export const DEFAULT_CATEGORY = 'Uncategorized'; @@ -11,7 +9,7 @@ export const DEFAULT_CATEGORY = 'Uncategorized'; * at the root level. */ export function deriveCategory(relativePath: string): string { - const parts = relativePath.split(path.sep); + const parts = relativePath.split(/[/\\]/); if (parts.length <= 1) { return DEFAULT_CATEGORY; } diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index c09fb76c..41601781 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -191,7 +191,7 @@ This runs all `code-grader` assertions against the `response.md` files. Results **Phase 2: LLM grading** (semantic — do NOT skip this phase) -Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. +Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. 
Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading. Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously. **Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair.