EntityProcess · christso · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -11,7 +11,7 @@
  *
  * To add new features: extend the handler — all logic is self-contained.
  */
-import { execSync } from 'node:child_process';
+import { exec } from 'node:child_process';
 import { existsSync, readFileSync, unlinkSync } from 'node:fs';
 import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
@@ -78,7 +78,7 @@ export const evalRunCommand = command({
     workers: option({
       type: optional(number),
       long: 'workers',
-      description: 'Parallel workers for target invocation (default: all tests)',
+      description: 'Parallel workers for target invocation (default: targets.yaml workers, then 5)',
     }),
     experiment: option({
       type: optional(string),
@@ -116,6 +116,7 @@ export const evalRunCommand = command({
     } | null = null;
     let targetName = 'agent';
     let targetKind = 'agent';
+    let targetWorkers: number | undefined;
 
     try {
       const selection = await selectTarget({
@@ -129,6 +130,7 @@ export const evalRunCommand = command({
         env: process.env,
       });
       targetName = selection.targetName;
+      targetWorkers = selection.resolvedTarget.workers;
       if (selection.resolvedTarget.kind === 'cli') {
         targetKind = 'cli';
         const config = selection.resolvedTarget.config;
@@ -216,9 +218,16 @@ export const evalRunCommand = command({
           .replace(/\./g, '-');
       }
       const mergedEnv = { ...process.env, ...envVars };
-      const maxWorkers = workers ?? testIds.length;
+      const maxWorkers = workers ?? targetWorkers ?? 5;
+      let invCompleted = 0;
+      const invTotal = testIds.length;
 
-      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
+      const writeInvProgress = () => {
+        process.stderr.write(`\rInvoking: ${invCompleted}/${invTotal} done`);
+      };
+
+      console.log(`Invoking ${invTotal} CLI target(s) (${maxWorkers} workers)...`);
+      writeInvProgress();
 
       const invokeTarget = async (testId: string): Promise<void> => {
         const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
@@ -244,12 +253,20 @@ export const evalRunCommand = command({
 
         const start = performance.now();
         try {
-          execSync(rendered, {
-            cwd,
-            timeout: timeoutMs,
-            env: mergedEnv,
-            stdio: ['pipe', 'pipe', 'pipe'],
-            maxBuffer: 10 * 1024 * 1024,
+          await new Promise<void>((resolveP, rejectP) => {
+            exec(
+              rendered,
+              {
+                cwd,
+                timeout: timeoutMs,
+                env: mergedEnv,
+                maxBuffer: 10 * 1024 * 1024,
+              },
+              (error) => {
+                if (error) rejectP(error);
+                else resolveP();
+              },
+            );
           });
           const durationMs = Math.round(performance.now() - start);
 
@@ -267,7 +284,7 @@ export const evalRunCommand = command({
             execution_status: 'ok',
           });
 
-          console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
+          process.stderr.write(`\n  ${testId}: OK (${durationMs}ms, ${response.length} chars)\n`);
         } catch (error) {
           const durationMs = Math.round(performance.now() - start);
           const message = error instanceof Error ? error.message : String(error);
@@ -278,8 +295,12 @@ export const evalRunCommand = command({
             total_duration_seconds: Math.round(durationMs / 10) / 100,
             execution_status: 'execution_error',
           });
-          console.error(`  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`);
+          process.stderr.write(
+            `\n  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`,
+          );
         } finally {
+          invCompleted++;
+          writeInvProgress();
           // Cleanup temp files
           try {
             if (existsSync(promptFile)) unlinkSync(promptFile);
@@ -302,6 +323,7 @@ export const evalRunCommand = command({
         }
       }
       await Promise.all(pending);
+      process.stderr.write('\n');
     } else {
       console.log('Subagent-as-target mode — skipping CLI invocation.');
     }

diff --git a/packages/core/src/evaluation/category.ts b/packages/core/src/evaluation/category.ts
@@ -1,5 +1,3 @@
-import path from 'node:path';
-
 /** Default category for eval files without subdirectory structure. */
 export const DEFAULT_CATEGORY = 'Uncategorized';
 
@@ -11,7 +9,7 @@ export const DEFAULT_CATEGORY = 'Uncategorized';
  * at the root level.
  */
 export function deriveCategory(relativePath: string): string {
-  const parts = relativePath.split(path.sep);
+  const parts = relativePath.split(/[/\\]/);
   if (parts.length <= 1) {
     return DEFAULT_CATEGORY;
   }

diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
@@ -191,7 +191,7 @@ This runs all `code-grader` assertions against the `response.md` files. Results
 
 **Phase 2: LLM grading** (semantic — do NOT skip this phase)
 
-Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**.
+Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading.
 Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously.
 
 **Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair.