From 7173597092357c1b37c1a9a9fb8ec24b63a8462e Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 27 Mar 2026 18:43:26 +1100
Subject: [PATCH] feat(pipeline): agent-mode reliability improvements

- Add --llm-scores flag to pipeline bench (#790)
- Add pipeline run combined command (#791)
- Fix Windows subprocess in Python scripts (#789)
- Update agentv-bench skill docs for cross-platform use (#792)
---
 apps/cli/src/commands/pipeline/bench.ts       |  23 +-
 apps/cli/src/commands/pipeline/index.ts       |   2 +
 apps/cli/src/commands/pipeline/run.ts         | 421 ++++++++++++++++++
 .../agentv-dev/skills/agentv-bench/SKILL.md   |  72 +--
 .../skills/agentv-bench/scripts/bench.py      |  12 +-
 .../agentv-bench/scripts/run_code_graders.py  |  12 +-
 .../skills/agentv-bench/scripts/run_tests.py  |  42 +-
 7 files changed, 542 insertions(+), 42 deletions(-)
 create mode 100644 apps/cli/src/commands/pipeline/run.ts
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
index 502858a3..5cd4cefc 100644
--- a/apps/cli/src/commands/pipeline/bench.ts
+++ b/apps/cli/src/commands/pipeline/bench.ts
@@ -2,8 +2,9 @@
  * `agentv pipeline bench` — Merge code-grader and LLM grader scores into final
  * benchmark artifacts.
  *
- * Reads code_grader_results from disk and LLM grader scores from stdin,
- * computes weighted pass_rate per test, and writes:
+ * Reads code_grader_results from disk and LLM grader scores from a file
+ * (`--llm-scores <path>`) or stdin, computes weighted pass_rate per test,
+ * and writes:
  *   - <test-id>/grading.json  (per-test grading breakdown)
  *   - index.jsonl             (one line per test)
  *   - benchmark.json          (aggregate statistics)
@@ -14,7 +15,7 @@
 import { readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
 
-import { command, positional, string } from 'cmd-ts';
+import { command, option, optional, positional, string } from 'cmd-ts';
 
 interface EvaluatorScore {
   readonly name: string;
@@ -33,14 +34,24 @@ export const evalBenchCommand = command({
       displayName: 'export-dir',
       description: 'Export directory from pipeline input/grade',
     }),
+    llmScores: option({
+      type: optional(string),
+      long: 'llm-scores',
+      description: 'Path to LLM scores JSON file (reads from stdin if omitted)',
+    }),
   },
-  handler: async ({ exportDir }) => {
+  handler: async ({ exportDir, llmScores: llmScoresPath }) => {
     const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
     const testIds: string[] = manifest.test_ids;
     const targetName: string = manifest.target?.name ?? 'unknown';
 
-    // Read LLM scores from stdin
-    const stdinData = await readStdin();
+    // Read LLM scores from file or stdin
+    let stdinData: string;
+    if (llmScoresPath) {
+      stdinData = await readFile(llmScoresPath, 'utf8');
+    } else {
+      stdinData = await readStdin();
+    }
     const llmScores: Record<
       string,
       Record<
diff --git a/apps/cli/src/commands/pipeline/index.ts b/apps/cli/src/commands/pipeline/index.ts
index f038e698..365b0d4e 100644
--- a/apps/cli/src/commands/pipeline/index.ts
+++ b/apps/cli/src/commands/pipeline/index.ts
@@ -3,6 +3,7 @@ import { subcommands } from 'cmd-ts';
 import { evalBenchCommand } from './bench.js';
 import { evalGradeCommand } from './grade.js';
 import { evalInputCommand } from './input.js';
+import { evalRunCommand } from './run.js';
 
 export const pipelineCommand = subcommands({
   name: 'pipeline',
@@ -11,5 +12,6 @@ export const pipelineCommand = subcommands({
     input: evalInputCommand,
     grade: evalGradeCommand,
     bench: evalBenchCommand,
+    run: evalRunCommand,
   },
 });
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
new file mode 100644
index 00000000..c52e7445
--- /dev/null
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -0,0 +1,421 @@
+/**
+ * `agentv pipeline run` — Combined command that runs input extraction, CLI target
+ * invocation, and code grading in a single step.
+ *
+ * Equivalent to running:
+ *   1. `agentv pipeline input <eval> --out <dir>`
+ *   2. Invoking each CLI target in parallel (writing response.md + timing.json)
+ *   3. `agentv pipeline grade <dir>`
+ *
+ * For `kind: agent` targets, step 2 is skipped (agent handles execution).
+ *
+ * To add new features: extend the handler — all logic is self-contained.
+ */
+import { execSync } from 'node:child_process';
+import { existsSync, readFileSync, unlinkSync } from 'node:fs';
+import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { dirname, join, resolve } from 'node:path';
+
+import { executeScript, loadTestSuite } from '@agentv/core';
+import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
+import { command, number, option, optional, positional, string } from 'cmd-ts';
+
+import { findRepoRoot } from '../eval/shared.js';
+import { selectTarget } from '../eval/targets.js';
+
+/**
+ * Find the nearest `.env` (in `dir`, else the closest ancestor directory) and
+ * parse its key=value lines; blank lines, `#` comments, and lines without `=`
+ * are skipped. Returns an empty object when no `.env` is found on the way up.
+ */
+function loadEnvFile(dir: string): Record<string, string> {
+  for (let current = resolve(dir); ; current = dirname(current)) {
+    const envPath = join(current, '.env');
+    if (existsSync(envPath)) {
+      const entries: Record<string, string> = {};
+      for (const rawLine of readFileSync(envPath, 'utf8').split('\n')) {
+        const text = rawLine.trim();
+        const sep = text.indexOf('=');
+        if (text === '' || text.startsWith('#') || sep === -1) continue;
+        entries[text.slice(0, sep).trim()] = text.slice(sep + 1).trim();
+      }
+      return entries;
+    }
+    // No parent left to climb to: dirname() returned the same path.
+    if (dirname(current) === current) return {};
+  }
+}
+
+export const evalRunCommand = command({
+  name: 'run',
+  description: 'Extract inputs, invoke CLI targets, and run code graders in one step',
+  args: {
+    evalPath: positional({
+      type: string,
+      displayName: 'eval-path',
+      description: 'Path to eval YAML file',
+    }),
+    out: option({
+      type: string,
+      long: 'out',
+      description: 'Output directory for results',
+    }),
+    workers: option({
+      type: optional(number),
+      long: 'workers',
+      description: 'Parallel workers for target invocation (default: all tests)',
+    }),
+  },
+  handler: async ({ evalPath, out, workers }) => {
+    const resolvedEvalPath = resolve(evalPath);
+    const outDir = resolve(out);
+    const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
+    const evalDir = dirname(resolvedEvalPath);
+
+    // ── Step 1: Extract inputs (same as pipeline input) ──────────────
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
+    const tests = suite.tests;
+
+    if (tests.length === 0) {
+      console.error('No tests found in eval file.');
+      process.exit(1);
+    }
+
+    let targetInfo: {
+      kind: 'cli';
+      command: string;
+      cwd: string;
+      timeoutMs: number;
+    } | null = null;
+    let targetName = 'agent';
+    let targetKind = 'agent';
+
+    try {
+      const selection = await selectTarget({
+        testFilePath: resolvedEvalPath,
+        repoRoot,
+        cwd: evalDir,
+        dryRun: false,
+        dryRunDelay: 0,
+        dryRunDelayMin: 0,
+        dryRunDelayMax: 0,
+        env: process.env,
+      });
+      targetName = selection.targetName;
+      if (selection.resolvedTarget.kind === 'cli') {
+        targetKind = 'cli';
+        const config = selection.resolvedTarget.config;
+        targetInfo = {
+          kind: 'cli',
+          command: config.command,
+          cwd: config.cwd ?? evalDir,
+          timeoutMs: config.timeoutMs ?? 30000,
+        };
+      }
+    } catch {
+      // No targets file — agent-as-target mode
+    }
+
+    const testIds: string[] = [];
+
+    for (const test of tests) {
+      const testDir = join(outDir, test.id);
+      await mkdir(testDir, { recursive: true });
+      testIds.push(test.id);
+
+      const inputText = test.question;
+      const inputMessages = test.input.map((m) => ({
+        role: m.role,
+        content: typeof m.content === 'string' ? m.content : m.content,
+      }));
+      await writeJson(join(testDir, 'input.json'), {
+        input_text: inputText,
+        input_messages: inputMessages,
+        file_paths: test.file_paths,
+        metadata: test.metadata ?? {},
+      });
+
+      if (targetInfo) {
+        await writeJson(join(testDir, 'invoke.json'), {
+          kind: 'cli',
+          command: targetInfo.command,
+          cwd: targetInfo.cwd,
+          timeout_ms: targetInfo.timeoutMs,
+          env: {},
+        });
+      } else {
+        await writeJson(join(testDir, 'invoke.json'), {
+          kind: 'agent',
+          instructions: 'Execute this task in the current workspace. The agent IS the target.',
+        });
+      }
+
+      await writeFile(join(testDir, 'criteria.md'), test.criteria ?? '', 'utf8');
+
+      if (
+        test.expected_output.length > 0 ||
+        (test.reference_answer !== undefined && test.reference_answer !== '')
+      ) {
+        await writeJson(join(testDir, 'expected_output.json'), {
+          expected_output: test.expected_output,
+          reference_answer: test.reference_answer ?? '',
+        });
+      }
+
+      await writeGraderConfigs(testDir, test.assertions ?? [], evalDir);
+    }
+
+    await writeJson(join(outDir, 'manifest.json'), {
+      eval_file: resolvedEvalPath,
+      timestamp: new Date().toISOString(),
+      target: { name: targetName, kind: targetKind },
+      test_ids: testIds,
+    });
+
+    console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);
+
+    // ── Step 2: Invoke CLI targets in parallel ───────────────────────
+    if (targetInfo) {
+      const envVars = loadEnvFile(evalDir);
+      const mergedEnv = { ...process.env, ...envVars };
+      const maxWorkers = workers ?? testIds.length;
+
+      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
+
+      const invokeTarget = async (testId: string): Promise<void> => {
+        const testDir = join(outDir, testId);
+        const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8'));
+        if (invoke.kind !== 'cli') return;
+
+        const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
+        const template: string = invoke.command;
+        const cwd: string = invoke.cwd;
+        const timeoutMs: number = invoke.timeout_ms ?? 120000;
+
+        // Write temp prompt file
+        const promptFile = join(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
+        const outputFile = join(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
+        await writeFile(promptFile, inputData.input_text, 'utf8');
+
+        let rendered = template;
+        rendered = rendered.replace('{PROMPT_FILE}', promptFile);
+        rendered = rendered.replace('{OUTPUT_FILE}', outputFile);
+        rendered = rendered.replace('{PROMPT}', inputData.input_text);
+
+        const start = performance.now();
+        try {
+          execSync(rendered, {
+            cwd,
+            timeout: timeoutMs,
+            env: mergedEnv,
+            stdio: ['pipe', 'pipe', 'pipe'],
+            maxBuffer: 10 * 1024 * 1024,
+          });
+          const durationMs = Math.round(performance.now() - start);
+
+          let response: string;
+          if (existsSync(outputFile)) {
+            response = readFileSync(outputFile, 'utf8');
+          } else {
+            response = 'ERROR: No output file generated';
+          }
+
+          await writeFile(join(testDir, 'response.md'), response, 'utf8');
+          await writeJson(join(testDir, 'timing.json'), {
+            duration_ms: durationMs,
+            total_duration_seconds: Math.round(durationMs / 10) / 100,
+          });
+
+          console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
+        } catch (error) {
+          const durationMs = Math.round(performance.now() - start);
+          const message = error instanceof Error ? error.message : String(error);
+          const response = `ERROR: target failed — ${message}`;
+          await writeFile(join(testDir, 'response.md'), response, 'utf8');
+          await writeJson(join(testDir, 'timing.json'), {
+            duration_ms: durationMs,
+            total_duration_seconds: Math.round(durationMs / 10) / 100,
+          });
+          console.error(`  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`);
+        } finally {
+          // Cleanup temp files
+          try {
+            if (existsSync(promptFile)) unlinkSync(promptFile);
+            if (existsSync(outputFile)) unlinkSync(outputFile);
+          } catch {
+            /* ignore cleanup errors */
+          }
+        }
+      };
+
+      // NOTE: invokeTarget uses blocking execSync, so targets effectively run one at a time and --workers is not enforced
+      const allTasks = testIds.map((testId) => invokeTarget(testId));
+      await Promise.all(allTasks);
+    } else {
+      console.log('Agent-as-target mode — skipping CLI invocation.');
+    }
+
+    // ── Step 3: Run code graders (same as pipeline grade) ────────────
+    let totalGraders = 0;
+    let totalPassed = 0;
+
+    for (const testId of testIds) {
+      const testDir = join(outDir, testId);
+      const codeGradersDir = join(testDir, 'code_graders');
+      const resultsDir = join(testDir, 'code_grader_results');
+
+      let graderFiles: string[];
+      try {
+        graderFiles = (await readdir(codeGradersDir)).filter((f) => f.endsWith('.json'));
+      } catch {
+        continue;
+      }
+      if (graderFiles.length === 0) continue;
+      await mkdir(resultsDir, { recursive: true });
+
+      const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
+      const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
+
+      for (const graderFile of graderFiles) {
+        const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8'));
+        const graderName = graderConfig.name;
+
+        const payload = JSON.stringify({
+          output: [{ role: 'assistant', content: responseText }],
+          input: inputData.input_messages,
+          question: inputData.input_text,
+          criteria: '',
+          expected_output: [],
+          reference_answer: '',
+          input_files: [],
+          trace: null,
+          token_usage: null,
+          cost_usd: null,
+          duration_ms: null,
+          start_time: null,
+          end_time: null,
+          file_changes: null,
+          workspace_path: null,
+          config: graderConfig.config ?? null,
+          metadata: {},
+          input_text: inputData.input_text,
+          output_text: responseText,
+          expected_output_text: '',
+        });
+
+        try {
+          const stdout = await executeScript(
+            graderConfig.command,
+            payload,
+            undefined,
+            graderConfig.cwd,
+          );
+          const parsed = JSON.parse(stdout);
+          const score = typeof parsed.score === 'number' ? parsed.score : 0;
+          const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
+
+          await writeFile(
+            join(resultsDir, `${graderName}.json`),
+            `${JSON.stringify(
+              {
+                name: graderName,
+                type: 'code-grader',
+                score,
+                weight: graderConfig.weight ?? 1.0,
+                assertions,
+                details: parsed.details ?? {},
+              },
+              null,
+              2,
+            )}\n`,
+            'utf8',
+          );
+          totalGraders++;
+          if (score >= 0.5) totalPassed++;
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          console.error(`  ${testId}/${graderName}: ERROR — ${message}`);
+          await writeFile(
+            join(resultsDir, `${graderName}.json`),
+            `${JSON.stringify(
+              {
+                name: graderName,
+                type: 'code-grader',
+                score: 0,
+                weight: graderConfig.weight ?? 1.0,
+                assertions: [{ text: `Error: ${message}`, passed: false }],
+                details: { error: message },
+              },
+              null,
+              2,
+            )}\n`,
+            'utf8',
+          );
+          totalGraders++;
+        }
+      }
+    }
+
+    console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
+    console.log(`\nDone. Agent can now perform LLM grading on responses in ${outDir}`);
+  },
+});
+
+// ── Helpers (shared with input.ts) ──────────────────────────────────
+
+async function writeJson(filePath: string, data: unknown): Promise<void> {
+  await writeFile(filePath, `${JSON.stringify(data, null, 2)}\n`, 'utf8');
+}
+
+async function writeGraderConfigs(
+  testDir: string,
+  assertions: readonly EvaluatorConfig[],
+  evalDir: string,
+): Promise<void> {
+  const codeGradersDir = join(testDir, 'code_graders');
+  const llmGradersDir = join(testDir, 'llm_graders');
+
+  let hasCodeGraders = false;
+  let hasLlmGraders = false;
+
+  for (const assertion of assertions) {
+    if (assertion.type === 'code-grader') {
+      if (!hasCodeGraders) {
+        await mkdir(codeGradersDir, { recursive: true });
+        hasCodeGraders = true;
+      }
+      const config = assertion as CodeEvaluatorConfig;
+      await writeJson(join(codeGradersDir, `${config.name}.json`), {
+        name: config.name,
+        command: config.command,
+        cwd: config.resolvedCwd ?? config.cwd ?? evalDir,
+        weight: config.weight ?? 1.0,
+        config: config.config ?? {},
+      });
+    } else if (assertion.type === 'llm-grader') {
+      if (!hasLlmGraders) {
+        await mkdir(llmGradersDir, { recursive: true });
+        hasLlmGraders = true;
+      }
+      const config = assertion as LlmGraderEvaluatorConfig;
+      let promptContent = '';
+      if (config.resolvedPromptPath) {
+        try {
+          promptContent = readFileSync(config.resolvedPromptPath, 'utf8');
+        } catch {
+          promptContent = typeof config.prompt === 'string' ? config.prompt : '';
+        }
+      } else if (typeof config.prompt === 'string') {
+        promptContent = config.prompt;
+      }
+      await writeJson(join(llmGradersDir, `${config.name}.json`), {
+        name: config.name,
+        prompt_content: promptContent,
+        weight: config.weight ?? 1.0,
+        threshold: 0.5,
+        config: {},
+      });
+    }
+  }
+}
diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
index 1126df59..4b742834 100644
--- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md
+++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
@@ -48,17 +48,14 @@ These scripts break the eval pipeline into discrete steps. The agent runs them i
 ### Agent-mode workflow
 
 ```bash
-# 1. Extract inputs and run CLI targets
-python scripts/run_tests.py evals/repro.eval.yaml --out .agentv/results/export/run-1
+# 1. Extract inputs, invoke CLI targets, run code graders (one command):
+agentv pipeline run evals/repro.eval.yaml --out .agentv/results/export/run-1
 
-# 2. Run code graders (deterministic, no LLM needed)
-python scripts/run_code_graders.py .agentv/results/export/run-1
-
-# 3. Agent performs LLM grading (reads llm_graders/*.json, produces scores JSON)
+# 2. Agent performs LLM grading (reads llm_graders/*.json, produces scores JSON)
 # ... agent reads prompts, grades responses, writes llm_scores.json ...
 
-# 4. Merge all scores and produce final artifacts
-python scripts/bench.py .agentv/results/export/run-1 < llm_scores.json
+# 3. Merge all scores and produce final artifacts
+agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json
 ```
 
 ### Skill management scripts
@@ -276,30 +273,51 @@ When `AGENT_EVAL_MODE=agent` (default), use the pipeline CLI subcommands (`pipel
 - `agentv` CLI is installed (or run from source with `bun apps/cli/src/cli.ts`)
 - Read `references/eval-yaml-spec.md` for the full schema
 
-**Step 1: Extract inputs and run targets**
+**Recommended: Single command for CLI targets**
+
+For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step:
 
 ```bash
-# Using Python wrapper (recommended — handles target invocation in parallel):
-python scripts/run_tests.py evals/repro.eval.yaml --out .agentv/results/export/run-1
+# Extract inputs, invoke all CLI targets in parallel, run code graders:
+agentv pipeline run evals/repro.eval.yaml --out .agentv/results/export/run-1
+```
 
-# Or using CLI directly (extract only, agent handles execution):
-agentv pipeline input evals/repro.eval.yaml --out .agentv/results/export/run-1
+Then the agent performs LLM grading and merges scores:
+
+```bash
+# bash:
+agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json
+
+# PowerShell (if --llm-scores is unavailable, pipe via Get-Content):
+# Get-Content llm_scores.json | agentv pipeline bench .agentv/results/export/run-1
 ```
 
-This creates an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`). For CLI targets, `run_tests.py` also invokes the target and writes `response.md`.
+That's the entire pipeline: **2 commands** plus the agent's LLM grading step.
 
-For agent-as-target mode, the agent executes each test using the extracted `input.json` and writes `response.md` directly.
+**Alternative: Step-by-step (agent-as-target or fine-grained control)**
 
-**Step 2: Run code graders**
+Use individual commands when the agent IS the target or you need control over each step:
 
 ```bash
-python scripts/run_code_graders.py .agentv/results/export/run-1
-# Or: agentv pipeline grade .agentv/results/export/run-1
+# Step 1: Extract inputs
+agentv pipeline input evals/repro.eval.yaml --out .agentv/results/export/run-1
+
+# Step 2: Agent invokes each test (reads input.json, writes response.md)
+#         For CLI targets, you can also use the Python wrapper:
+#         python scripts/run_tests.py evals/repro.eval.yaml --out .agentv/results/export/run-1
+
+# Step 3: Run code graders
+agentv pipeline grade .agentv/results/export/run-1
+
+# Step 4: Agent does LLM grading, writes llm_scores.json
+
+# Step 5: Merge scores
+agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json
 ```
 
-Executes all code-grader assertions against `response.md` files. Writes `code_grader_results/<name>.json` per test.
+Both `pipeline run` and `pipeline input` create an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`).
 
-**Step 3: LLM grading (agent performs directly)**
+**LLM grading (agent performs directly)**
 
 The agent reads `llm_graders/<name>.json` for each test, grades the response using the prompt content, and produces a scores JSON:
 
@@ -314,19 +332,11 @@ The agent reads `llm_graders/<name>.json` for each test, grades the response usi
 }
 ```
 
-Dispatch the `grader` subagent (read `agents/grader.md`) for this step.
+**Subagent environments (Claude Code):** Dispatch the `grader` subagent (read `agents/grader.md`) for this step.
 
-**Step 4: Merge scores and produce artifacts**
-
-```bash
-python scripts/bench.py .agentv/results/export/run-1 < llm_scores.json
-# Or: agentv pipeline bench .agentv/results/export/run-1 < llm_scores.json
-```
+**Non-subagent environments (VS Code Copilot, Codex, etc.):** Perform LLM grading inline. Read each `llm_graders/<name>.json`, grade the response against the `prompt_content` criteria, score 0.0–1.0 with evidence, and write the result to `llm_scores.json` — the same path you later pass to `pipeline bench --llm-scores`.
 
-Merges code-grader + LLM scores, computes weighted pass_rate, and writes:
-- `<test-id>/grading.json` — per-test grading breakdown
-- `index.jsonl` — one line per test
-- `benchmark.json` — aggregate statistics
+**Note on Python wrapper scripts:** The `scripts/` directory contains Python wrappers (`run_tests.py`, `run_code_graders.py`, `bench.py`) that call the CLI commands. These are provided as an alternative but the direct CLI commands above are preferred — they work cross-platform without Python dependency issues.
 
 **Output structure:**
 ```
diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py b/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py
index 07624279..0197ced1 100644
--- a/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py
+++ b/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py
@@ -29,10 +29,20 @@
     <export-dir>/<test-id>/grading.json <- merged grading per test
 """
 import argparse
+import shutil
 import subprocess
 import sys
 
 
+def _find_agentv() -> str:
+    """Look up `agentv` on PATH; print an install hint and exit(1) if missing."""
+    located = shutil.which("agentv")
+    if located:
+        return located
+    print("agentv CLI not found. Install: bun install -g agentv", file=sys.stderr)
+    sys.exit(1)
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Merge scores and produce benchmark artifacts"
@@ -42,7 +52,7 @@ def main():
 
     # Pass stdin through to agentv pipeline bench
     result = subprocess.run(
-        ["agentv", "pipeline", "bench", args.export_dir],
+        [_find_agentv(), "pipeline", "bench", args.export_dir],
         stdin=sys.stdin,
     )
     sys.exit(result.returncode)
diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py
index 2b846c42..b69dbf56 100644
--- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py
+++ b/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py
@@ -19,17 +19,27 @@
     <export-dir>/<test-id>/code_grader_results/<name>.json
 """
 import argparse
+import shutil
 import subprocess
 import sys
 
 
+def _find_agentv() -> str:
+    """Look up `agentv` on PATH; print an install hint and exit(1) if missing."""
+    located = shutil.which("agentv")
+    if located:
+        return located
+    print("agentv CLI not found. Install: bun install -g agentv", file=sys.stderr)
+    sys.exit(1)
+
+
 def main():
     parser = argparse.ArgumentParser(description="Run code-grader assertions")
     parser.add_argument("export_dir", help="Export directory from pipeline input")
     args = parser.parse_args()
 
     result = subprocess.run(
-        ["agentv", "pipeline", "grade", args.export_dir],
+        [_find_agentv(), "pipeline", "grade", args.export_dir],
         capture_output=False,
     )
     sys.exit(result.returncode)
diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py
index 3dbda367..c8ce32f7 100644
--- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py
+++ b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py
@@ -26,6 +26,7 @@
 import argparse
 import json
 import os
+import shutil
 import subprocess
 import sys
 import tempfile
@@ -35,10 +36,33 @@
 from pathlib import Path
 
 
+def _find_agentv() -> str:
+    """Look up `agentv` on PATH; print an install hint and exit(1) if missing."""
+    located = shutil.which("agentv")
+    if located:
+        return located
+    print("agentv CLI not found. Install: bun install -g agentv", file=sys.stderr)
+    sys.exit(1)
+
+
+def _load_env(env_file: Path) -> dict:
+    """Read key=value pairs from a .env file, ignoring comments and blanks.
+    Decodes as UTF-8 and drops a leading BOM (common in Windows-edited files).
+    """
+    env = {}
+    for line in env_file.read_text(encoding="utf-8-sig").splitlines():
+        line = line.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, _, value = line.partition("=")
+        env[key.strip()] = value.strip()
+    return env
+
+
 def run_agentv_input(eval_path: str, out_dir: str) -> dict:
     """Call agentv pipeline input and return the manifest."""
     result = subprocess.run(
-        ["agentv", "pipeline", "input", eval_path, "--out", out_dir],
+        [_find_agentv(), "pipeline", "input", eval_path, "--out", out_dir],
         capture_output=True,
         text=True,
     )
@@ -49,7 +73,7 @@ def run_agentv_input(eval_path: str, out_dir: str) -> dict:
     return json.loads(manifest_path.read_text())
 
 
-def invoke_cli_target(test_dir: Path) -> None:
+def invoke_cli_target(test_dir: Path, extra_env: dict | None = None) -> None:
     """Read invoke.json and execute the CLI target command."""
     invoke_path = test_dir / "invoke.json"
     invoke = json.loads(invoke_path.read_text())
@@ -61,6 +85,7 @@ def invoke_cli_target(test_dir: Path) -> None:
     command_template = invoke["command"]
     cwd = invoke.get("cwd")
     timeout_s = invoke.get("timeout_ms", 30000) / 1000
+    merged_env = {**os.environ, **(extra_env or {})}
 
     # Write prompt to temp file for {PROMPT_FILE} placeholder
     with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as pf:
@@ -85,6 +110,7 @@ def invoke_cli_target(test_dir: Path) -> None:
             capture_output=True,
             text=True,
             timeout=timeout_s,
+            env=merged_env,
         )
         duration_ms = int((time.time() - start) * 1000)
 
@@ -130,6 +156,16 @@ def main():
         ts = datetime.now(timezone.utc).isoformat().replace(":", "-").replace(".", "-")
         os.environ["AGENTV_RUN_TIMESTAMP"] = ts
 
+    # Load .env from eval directory or any parent
+    eval_dir = Path(args.eval_path).resolve().parent
+    env_file = None
+    for p in [eval_dir] + list(eval_dir.parents):
+        candidate = p / ".env"
+        if candidate.exists():
+            env_file = candidate
+            break
+    extra_env = _load_env(env_file) if env_file else {}
+
     manifest = run_agentv_input(args.eval_path, args.out)
     out = Path(args.out)
 
@@ -149,7 +185,7 @@ def main():
 
     print(f"Running {len(cli_tests)} CLI target(s) with {args.workers} workers...")
     with ThreadPoolExecutor(max_workers=args.workers) as pool:
-        futures = {pool.submit(invoke_cli_target, td): td.name for td in cli_tests}
+        futures = {pool.submit(invoke_cli_target, td, extra_env): td.name for td in cli_tests}
         for future in as_completed(futures):
             tid = futures[future]
             try: