Merged
17 changes: 14 additions & 3 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ import { dirname, join, relative, resolve } from 'node:path';

import { deriveCategory, executeScript, loadTestSuite } from '@agentv/core';
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
import { command, number, option, optional, positional, string } from 'cmd-ts';
import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts';

import { buildDefaultRunDir } from '../eval/result-layout.js';
import { findRepoRoot } from '../eval/shared.js';
@@ -83,8 +83,13 @@ export const evalRunCommand = command({
long: 'experiment',
description: 'Experiment label (e.g. with_skills, without_skills)',
}),
graderType: option({
type: optional(oneOf(['code', 'none'])),
long: 'grader-type',
description: 'Which grading phase to run: "code" runs code-graders inline, omit to skip grading (use pipeline grade separately)',
}),
},
handler: async ({ evalPath, out, workers, experiment }) => {
handler: async ({ evalPath, out, workers, experiment, graderType }) => {
const resolvedEvalPath = resolve(evalPath);
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
@@ -298,7 +303,13 @@ export const evalRunCommand = command({
console.log('Subagent-as-target mode — skipping CLI invocation.');
}

// ── Step 3: Run code graders (same as pipeline grade) ────────────
// ── Step 3: Run code graders (only when explicitly requested) ─────
if (graderType !== 'code') {
console.log(`\nDone. Results in ${outDir}`);
console.log('To run code graders: agentv pipeline grade <run-dir> (or re-run with --grader-type code)');
return;
}

let totalGraders = 0;
let totalPassed = 0;

388 changes: 46 additions & 342 deletions plugins/agentv-dev/skills/agentv-bench/SKILL.md

Large diffs are not rendered by default.

@@ -0,0 +1,66 @@
# Description Optimization

Optimize the `description` field in a skill's SKILL.md frontmatter for better triggering
accuracy. Use this after the agent/skill is working well — this is a polish step, not a
core workflow step.

**Provider compatibility**: Description optimization applies to any agent platform with
skill-discovery mechanisms — Claude Code, Codex (`.agents/` or `.codex/` folders), Copilot,
and others. The `skill-trigger` evaluator checks whether the agent invoked the right skill,
regardless of how discovery works on that platform.

## Step 1: Generate Trigger EVAL.yaml

Create 20 test cases:
- **10 should-trigger**: realistic prompts where this skill should activate — different
phrasings, casual speech, uncommon use cases, edge cases where this skill competes with
another but should win
- **10 should-not-trigger**: near-miss prompts that share keywords but actually need
something different — adjacent domains, ambiguous phrasing where naive matching would
trigger but shouldn't

Prompts must be realistic — include file paths, personal context, typos, casual speech.
Not abstract requests like "format data" but concrete ones like "ok so my boss sent me
Q4-sales-FINAL-v2.xlsx and she wants me to add a profit margin column..."

The should-not-trigger cases are the most valuable. "Write a fibonacci function" as a
negative test for an eval skill is useless — it doesn't test anything. The negative cases
should be genuinely tricky near-misses.

Write as EVAL.yaml with top-level input (the user prompt doesn't specify the skill name —
it's a natural utterance):

```yaml
# trigger-eval.eval.yaml
tests:
tests:
  - id: should-trigger-casual-optimize
    input: "ok so I have this agent that keeps failing on the code review tasks, can you help me figure out why and fix it"
    assertions:
      - type: skill-trigger
        skill: agentv-bench
  - id: should-not-trigger-build-error
    input: "my TypeScript build is failing with type errors in src/auth.ts"
    assertions:
      - type: skill-trigger
        skill: agentv-bench
        should_trigger: false
```

## Step 2: Review with User

Present the eval set. The user adjusts queries, toggles should-trigger, adds/removes cases.
This step matters — bad eval queries lead to bad descriptions.

## Step 3: Iterate on Description

Run the trigger eval, identify misfires, rewrite the description, re-run. Max 5 iterations.
Select the best description by held-out test accuracy (60% train / 40% test split) to avoid
overfitting.

Use the grader and analyzer subagents to identify trigger failures and propose description
improvements — the same eval → grade → analyze → improve loop used for agent output quality.
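
The iterate-and-select loop above can be sketched in code. This is a minimal sketch, assuming
per-candidate trigger results are already collected; the `TriggerResult` shape and helper names
are invented for illustration and are not part of the agentv API:

```typescript
// Hypothetical sketch: pick the candidate description whose trigger
// results score best on a held-out 40% test split.

interface TriggerResult {
  testId: string;
  passed: boolean; // did the skill trigger (or not) as expected?
}

// Deterministically split test ids 60/40 into train and held-out sets.
function splitTests(
  testIds: string[],
  trainRatio = 0.6,
): { train: Set<string>; test: Set<string> } {
  const ordered = [...testIds].sort(); // swap in a seeded shuffle if order bias matters
  const cut = Math.floor(ordered.length * trainRatio);
  return { train: new Set(ordered.slice(0, cut)), test: new Set(ordered.slice(cut)) };
}

function accuracy(results: TriggerResult[], ids: Set<string>): number {
  const subset = results.filter((r) => ids.has(r.testId));
  return subset.length === 0 ? 0 : subset.filter((r) => r.passed).length / subset.length;
}

// Iterate on the train split, but select the winner by held-out accuracy.
function selectBest(candidates: Map<string, TriggerResult[]>, testIds: string[]): string {
  const { test } = splitTests(testIds);
  let best = '';
  let bestScore = -1;
  for (const [description, results] of candidates) {
    const score = accuracy(results, test);
    if (score > bestScore) {
      bestScore = score;
      best = description;
    }
  }
  return best;
}
```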

## Step 4: Apply

Update the skill's SKILL.md frontmatter with the optimized description. Show the user
before/after with accuracy scores.
@@ -0,0 +1,82 @@
# Environment Adaptation

Provider-specific notes, CI/headless behavior, and fallback strategies for environments
with limited capabilities.

## CI/Headless Mode

Skip interactive prompts. Exit with a pass/fail status code. Always generate artifacts for
downstream consumption.

## No Subagents Available (e.g., Claude.ai)

Run test cases serially. Skip blind comparison. Present results directly in conversation —
for each test case, show the prompt and output. Ask for feedback inline. Skip benchmarking
(it relies on baseline comparisons that aren't meaningful without subagents).

## Provider-Specific Notes

- **Copilot CLI**: Uses ACP protocol via `copilot --acp --stdio`
- **Claude SDK**: Requires `@anthropic-ai/claude-agent-sdk` installed
- **Codex**: Supports skills via `.agents/` or `.codex/` folders. Emits `command_execution`
and `file_change` tool calls.
- **Custom CLI**: Needs `command` and output file pattern in target config
- **Target config**: Uses `${{ ENV_VAR }}` syntax (not `${ENV_VAR}`) for API keys

**Note**: "Description Optimization" (see `references/description-optimization.md`) applies
to any platform with skill-discovery mechanisms. All listed providers support skills.

## Unsupported Providers: Use a Code-Grader

The built-in `skill-trigger` evaluator covers Claude, Copilot, Pi, Codex, and VS Code out
of the box. For providers with different tool-call formats, write a code-grader that inspects
the agent's tool-call trace.

A code-grader receives the full evaluation context including the agent's output messages and
tool calls. You can inspect these to determine whether the skill was invoked:

```yaml
# Example: code-grader for Codex skill-trigger detection
tests:
  - id: should-trigger-codex
    input: "Analyze this CSV file"
    assertions:
      - type: code-grader
        path: ./judges/codex-skill-trigger.ts
```

```typescript
// judges/codex-skill-trigger.ts
import { defineCodeGrader } from '@agentv/eval';

export default defineCodeGrader(({ output }) => {
  const skillName = 'csv-analyzer';
  const toolCalls = (output ?? []).flatMap((msg) => msg.toolCalls ?? []);
  const firstTool = toolCalls[0];

  if (!firstTool) {
    return { score: 0, reason: 'No tool calls recorded' };
  }

  // Codex reads skill files via shell commands
  if (firstTool.tool === 'command_execution') {
    const cmd = String(firstTool.input ?? '');
    if (cmd.includes(skillName)) {
      return { score: 1, reason: `Skill "${skillName}" triggered via command: ${cmd}` };
    }
  }

  // Check if the skill file was accessed via a file_change tool call
  if (firstTool.tool === 'file_change') {
    const path = String((firstTool.input as Record<string, unknown>)?.path ?? '');
    if (path.includes(skillName)) {
      return { score: 1, reason: `Skill file accessed: ${path}` };
    }
  }

  return { score: 0, reason: `First tool was "${firstTool.tool}" — not a skill invocation for "${skillName}"` };
});
```

This approach is more flexible than config overrides — you can match any tool-call pattern,
check multiple fields, and add provider-specific logic as needed.
2 changes: 2 additions & 0 deletions plugins/agentv-dev/skills/agentv-bench/references/schemas.md
@@ -87,6 +87,8 @@ Tracks version progression in Improve mode. Located at workspace root.

Output from the grader agent. Located at `<run-dir>/grading.json`.

**Important:** The `assertions` array must use the fields `text`, `passed`, and `evidence` — downstream tooling depends on these exact field names.

```json
{
"assertions": [
165 changes: 165 additions & 0 deletions plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md
@@ -0,0 +1,165 @@
# Subagent Pipeline — Running eval.yaml without CLI

This reference documents the detailed procedure for running evaluations in subagent mode
(`SUBAGENT_EVAL_MODE=subagent`, the default). The orchestrating skill dispatches `executor`
subagents to perform test cases and `grader` subagents to evaluate outputs.

Read this reference when executing Step 3 (Run and Grade) in subagent mode.

## Prerequisites

- The eval.yaml file exists and contains valid test definitions
- `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`)
- Read `references/eval-yaml-spec.md` for the full schema

## Workspace Context

Some evals pass prompt files directly and don't require a specific workspace — those run fine
from anywhere. But evals that test agent behavior in a workspace (accessing skills, modifying
repos, using tools across multiple repos) require the user to be in the **target workspace**
(e.g., a multi-repo workspace set up by allagents). If the eval references workspace files or
expects the agent to use skills, check that the current directory is the target workspace, not
just the eval repo — and warn the user if it's wrong.

## Executor Subagent Eligibility

All providers except `cli` are eligible for executor subagents by default. To opt a
specific target out, set `subagent_mode_allowed: false` in `.agentv/targets.yaml`:

```yaml
# .agentv/targets.yaml
targets:
  - name: my-target
    provider: openai
    model: ${{ OPENAI_MODEL }}
    api_key: ${{ OPENAI_API_KEY }}
    subagent_mode_allowed: false  # forces CLI invocation instead of executor subagent
```

When `subagent_mode_allowed: false`, the target falls back to CLI invocation via `agentv eval`
even in subagent mode.

## CLI Targets: Single Command

For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and
code grading in one step. When `--out` is omitted, the output directory defaults to
`.agentv/results/runs/<timestamp>` (same convention as `agentv eval`):

```bash
# Extract inputs and invoke all CLI targets in parallel:
agentv pipeline run evals/repro.eval.yaml

# Also run code graders inline (instead of using pipeline grade separately):
agentv pipeline run evals/repro.eval.yaml --grader-type code
```

By default, `pipeline run` extracts inputs and invokes targets only. Pass `--grader-type code`
to also run code-graders inline, or use `agentv pipeline grade <run-dir>` as a separate step.

The run directory is printed to stdout. Then continue to the grading and merge phases
described in SKILL.md Step 3.

## Non-CLI Targets: Executor Subagents

When the target provider is not `cli`, check `manifest.json` → `target.subagent_mode_allowed`.
If `true` (default for all non-CLI providers), the subagent IS the target. If `false` (user
opted out via `subagent_mode_allowed: false` in `.agentv/targets.yaml`), fall back to
`agentv eval` CLI mode instead.

### Step 1: Extract inputs

```bash
# Defaults to .agentv/results/runs/<timestamp>
agentv pipeline input evals/repro.eval.yaml
```

This creates a run directory with per-test `input.json`, `invoke.json`,
`criteria.md`, and grader configs.

### Step 2: Dispatch executor subagents

Read `agents/executor.md`. Launch one `executor` subagent **per test case**, all in parallel.
Each subagent receives the test directory path, reads `input.json`, performs the task using
its own tools, and writes `response.md`.

Example: 5 tests = 5 executor subagents launched simultaneously.

```
# Per executor subagent:
# - Reads <run-dir>/<test-id>/input.json
# - Performs the task
# - Writes <run-dir>/<test-id>/response.md
```

### Step 3 onward: Grade and merge

See SKILL.md Step 3 "Grading" section for the three-phase grading process (code graders →
LLM grading → merge and validate).

## Step-by-Step Fine-Grained Control (CLI targets)

Use individual commands when you need control over each step with CLI targets:

```bash
# Step 1: Extract inputs (defaults to .agentv/results/runs/<timestamp>)
agentv pipeline input evals/repro.eval.yaml

# Step 2: run_tests.py invokes CLI targets (or use pipeline run instead)

# Step 3: Run code graders
agentv pipeline grade <run-dir>

# Step 4: Subagent does LLM grading, writes llm_scores.json

# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard)
agentv pipeline bench <run-dir> --llm-scores llm_scores.json

# Step 6: Validate
agentv results validate <run-dir>
```

## LLM Grading JSON Format

The agent reads `llm_graders/<name>.json` for each test, grades the response using the prompt
content, and produces a scores JSON:

```json
{
  "test-01": {
    "relevance": {
      "score": 0.85,
      "assertions": [{"text": "Response is relevant", "passed": true, "evidence": "..."}]
    }
  }
}
```
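
For reference, the same shape expressed as a TypeScript type. The field names mirror the JSON
above; the type names themselves are invented for this sketch.

```typescript
// Type-level sketch of llm_scores.json: test id -> grader name -> score.
interface GradedAssertion {
  text: string;
  passed: boolean;
  evidence: string;
}

interface CriterionScore {
  score: number; // 0..1
  assertions: GradedAssertion[];
}

type LlmScores = Record<string, Record<string, CriterionScore>>;

const example: LlmScores = {
  'test-01': {
    relevance: {
      score: 0.85,
      assertions: [{ text: 'Response is relevant', passed: true, evidence: '...' }],
    },
  },
};
```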

## Pipeline Bench and Dashboard

`pipeline bench` merges LLM scores into `index.jsonl` with a full `scores[]` array per entry,
matching the CLI-mode schema. The web dashboard (`agentv results serve`) reads this format
directly — no separate conversion script is needed. Run `agentv results validate <run-dir>`
to verify compatibility.

## Output Structure

The path hierarchy mirrors the CLI mode: `<evalset-name>` comes from the `name` field in
the eval.yaml. The target is recorded in `manifest.json` — one run = one target.

```
.agentv/results/runs/<timestamp>/
├── manifest.json ← eval metadata, target, test_ids
├── index.jsonl ← per-test scores
├── benchmark.json ← aggregate statistics
└── <evalset-name>/ ← from eval.yaml "name" field (omitted if absent)
└── <test-id>/ ← test case id
├── input.json ← test input text + messages
├── invoke.json ← target command or agent instructions
├── criteria.md ← grading criteria
├── response.md ← target/agent output
├── timing.json ← execution timing
├── code_graders/<name>.json ← code grader configs
├── llm_graders/<name>.json ← LLM grader configs
├── code_grader_results/<name>.json ← code grader results
└── grading.json ← merged grading
```
Empty file.