From 1fd06dd20917839f92429b23db39bad23eed3ab0 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Mon, 30 Mar 2026 13:20:15 +1100
Subject: [PATCH 1/5] fix(pipeline): use async exec for true parallel target
 invocation

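execSync blocks the Node.js event loop, so the concurrency pool ran
targets sequentially despite the async wrapper. Switch to the
callback-based exec, wrapped in a Promise, so workers actually run in
parallel. Also adds real-time progress reporting to the invocation step
(the progress line and per-target results are now written to stderr)
and changes the default worker count from one per test to 5.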

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/pipeline/run.ts | 36 +++++++++++++++++++--------
 1 file changed, 25 insertions(+), 11 deletions(-)
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
index 59c65d93..4eb8978a 100644
--- a/apps/cli/src/commands/pipeline/run.ts
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -11,7 +11,7 @@
  *
  * To add new features: extend the handler — all logic is self-contained.
  */
-import { execSync } from 'node:child_process';
+import { exec } from 'node:child_process';
 import { existsSync, readFileSync, unlinkSync } from 'node:fs';
 import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
@@ -216,9 +216,16 @@ export const evalRunCommand = command({
           .replace(/\./g, '-');
       }
       const mergedEnv = { ...process.env, ...envVars };
-      const maxWorkers = workers ?? testIds.length;
+      const maxWorkers = workers ?? 5;
+      let invCompleted = 0;
+      const invTotal = testIds.length;
 
-      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
+      const writeInvProgress = () => {
+        process.stderr.write(`\rInvoking: ${invCompleted}/${invTotal} done`);
+      };
+
+      console.log(`Invoking ${invTotal} CLI target(s) (${maxWorkers} workers)...`);
+      writeInvProgress();
 
       const invokeTarget = async (testId: string): Promise<void> => {
         const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
@@ -244,12 +251,16 @@ export const evalRunCommand = command({
 
         const start = performance.now();
         try {
-          execSync(rendered, {
-            cwd,
-            timeout: timeoutMs,
-            env: mergedEnv,
-            stdio: ['pipe', 'pipe', 'pipe'],
-            maxBuffer: 10 * 1024 * 1024,
+          await new Promise<void>((resolveP, rejectP) => {
+            exec(rendered, {
+              cwd,
+              timeout: timeoutMs,
+              env: mergedEnv,
+              maxBuffer: 10 * 1024 * 1024,
+            }, (error) => {
+              if (error) rejectP(error);
+              else resolveP();
+            });
           });
           const durationMs = Math.round(performance.now() - start);
 
@@ -267,7 +278,7 @@ export const evalRunCommand = command({
             execution_status: 'ok',
           });
 
-          console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
+          process.stderr.write(`\n  ${testId}: OK (${durationMs}ms, ${response.length} chars)\n`);
         } catch (error) {
           const durationMs = Math.round(performance.now() - start);
           const message = error instanceof Error ? error.message : String(error);
@@ -278,8 +289,10 @@ export const evalRunCommand = command({
             total_duration_seconds: Math.round(durationMs / 10) / 100,
             execution_status: 'execution_error',
           });
-          console.error(`  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`);
+          process.stderr.write(`\n  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`);
         } finally {
+          invCompleted++;
+          writeInvProgress();
           // Cleanup temp files
           try {
             if (existsSync(promptFile)) unlinkSync(promptFile);
@@ -302,6 +315,7 @@ export const evalRunCommand = command({
         }
       }
       await Promise.all(pending);
+      process.stderr.write('\n');
     } else {
       console.log('Subagent-as-target mode — skipping CLI invocation.');
     }

From 3f1bfec6c79c65e8fbfccddc692d6e566b0863b6 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Mon, 30 Mar 2026 13:23:07 +1100
Subject: [PATCH 2/5] style: fix biome formatting

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/pipeline/run.ts | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
index 4eb8978a..a8594000 100644
--- a/apps/cli/src/commands/pipeline/run.ts
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -252,15 +252,19 @@ export const evalRunCommand = command({
         const start = performance.now();
         try {
           await new Promise<void>((resolveP, rejectP) => {
-            exec(rendered, {
-              cwd,
-              timeout: timeoutMs,
-              env: mergedEnv,
-              maxBuffer: 10 * 1024 * 1024,
-            }, (error) => {
-              if (error) rejectP(error);
-              else resolveP();
-            });
+            exec(
+              rendered,
+              {
+                cwd,
+                timeout: timeoutMs,
+                env: mergedEnv,
+                maxBuffer: 10 * 1024 * 1024,
+              },
+              (error) => {
+                if (error) rejectP(error);
+                else resolveP();
+              },
+            );
           });
           const durationMs = Math.round(performance.now() - start);
 
@@ -289,7 +293,9 @@ export const evalRunCommand = command({
             total_duration_seconds: Math.round(durationMs / 10) / 100,
             execution_status: 'execution_error',
           });
-          process.stderr.write(`\n  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`);
+          process.stderr.write(
+            `\n  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`,
+          );
         } finally {
           invCompleted++;
           writeInvProgress();

From ac68ca2a02c9e108d2a398db5f0292ab0631914d Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Mon, 30 Mar 2026 13:25:30 +1100
Subject: [PATCH 3/5] fix(core): handle both path separators in deriveCategory

path.sep is \ on Windows, so relative paths written with / were never
split there, and deriveCategory returned 'Uncategorized' for every file.
Split on both / and \ so the result is the same on every platform.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 packages/core/src/evaluation/category.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/packages/core/src/evaluation/category.ts b/packages/core/src/evaluation/category.ts
index 926e34f8..7f4a39e5 100644
--- a/packages/core/src/evaluation/category.ts
+++ b/packages/core/src/evaluation/category.ts
@@ -1,5 +1,3 @@
-import path from 'node:path';
-
 /** Default category for eval files without subdirectory structure. */
 export const DEFAULT_CATEGORY = 'Uncategorized';
 
@@ -11,7 +9,7 @@ export const DEFAULT_CATEGORY = 'Uncategorized';
  * at the root level.
  */
 export function deriveCategory(relativePath: string): string {
-  const parts = relativePath.split(path.sep);
+  const parts = relativePath.split(/[/\\]/);
   if (parts.length <= 1) {
     return DEFAULT_CATEGORY;
   }

From 8e8307233d688b9793b0a244810d5e4876ac57b1 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Mon, 30 Mar 2026 13:38:03 +1100
Subject: [PATCH 4/5] docs(agentv-bench): require grader subagents for LLM grading

---
 plugins/agentv-dev/skills/agentv-bench/SKILL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
index c09fb76c..41601781 100644
--- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md
+++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
@@ -191,7 +191,7 @@ This runs all `code-grader` assertions against the `response.md` files. Results
 
 **Phase 2: LLM grading** (semantic — do NOT skip this phase)
 
-Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**.
+Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading.
 Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously.
 
 **Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair.

From 885c44299f67b04d976bac800c7f21a1813144bf Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Mon, 30 Mar 2026 13:41:11 +1100
Subject: [PATCH 5/5] feat(pipeline): read workers from targets.yaml config

pipeline run now respects the workers field from targets.yaml as a
fallback when --workers is not passed on the CLI. Precedence:
CLI flag > targets.yaml > default (5).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/pipeline/run.ts | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
index a8594000..be062a4c 100644
--- a/apps/cli/src/commands/pipeline/run.ts
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -78,7 +78,7 @@ export const evalRunCommand = command({
     workers: option({
       type: optional(number),
       long: 'workers',
-      description: 'Parallel workers for target invocation (default: all tests)',
+      description: 'Parallel workers for target invocation (default: targets.yaml workers, then 5)',
     }),
     experiment: option({
       type: optional(string),
@@ -116,6 +116,7 @@ export const evalRunCommand = command({
     } | null = null;
     let targetName = 'agent';
     let targetKind = 'agent';
+    let targetWorkers: number | undefined;
 
     try {
       const selection = await selectTarget({
@@ -129,6 +130,7 @@ export const evalRunCommand = command({
         env: process.env,
       });
       targetName = selection.targetName;
+      targetWorkers = selection.resolvedTarget.workers;
       if (selection.resolvedTarget.kind === 'cli') {
         targetKind = 'cli';
         const config = selection.resolvedTarget.config;
@@ -216,7 +218,7 @@ export const evalRunCommand = command({
           .replace(/\./g, '-');
       }
       const mergedEnv = { ...process.env, ...envVars };
-      const maxWorkers = workers ?? 5;
+      const maxWorkers = workers ?? targetWorkers ?? 5;
       let invCompleted = 0;
       const invTotal = testIds.length;