diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index d2c188113..9b082c601 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -19,7 +19,7 @@ import { dirname, join, relative, resolve } from 'node:path'; import { deriveCategory, executeScript, loadTestSuite } from '@agentv/core'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; -import { command, number, option, optional, positional, string } from 'cmd-ts'; +import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts'; import { buildDefaultRunDir } from '../eval/result-layout.js'; import { findRepoRoot } from '../eval/shared.js'; @@ -83,8 +83,13 @@ export const evalRunCommand = command({ long: 'experiment', description: 'Experiment label (e.g. with_skills, without_skills)', }), + graderType: option({ + type: optional(oneOf(['code', 'none'])), + long: 'grader-type', + description: 'Which grading phase to run: "code" runs code-graders inline, omit to skip grading (use pipeline grade separately)', + }), }, - handler: async ({ evalPath, out, workers, experiment }) => { + handler: async ({ evalPath, out, workers, experiment, graderType }) => { const resolvedEvalPath = resolve(evalPath); const outDir = resolve(out ?? buildDefaultRunDir(process.cwd())); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); @@ -298,7 +303,13 @@ export const evalRunCommand = command({ console.log('Subagent-as-target mode — skipping CLI invocation.'); } - // ── Step 3: Run code graders (same as pipeline grade) ──────────── + // ── Step 3: Run code graders (only when explicitly requested) ───── + if (graderType !== 'code') { + console.log(`\nDone. 
Results in ${outDir}`); + console.log('To run code graders: agentv pipeline grade (or re-run with --grader-type code)'); + return; + } + let totalGraders = 0; let totalPassed = 0; diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index b46bef58c..9aa7e796f 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -1,16 +1,11 @@ --- name: agentv-bench description: >- - Run AgentV evaluations and optimize agents through eval-driven iteration. Use when asked to - run `agentv eval`, execute an EVAL.yaml or evals.json file, benchmark agent performance across - providers, analyze eval results, compare agent outputs, optimize prompts against evals, or - improve agent performance. Also use for offline evaluation of recorded sessions (e.g., - copilot-log transcripts) and deterministic-only evals that need no LLM API key. - Use this skill whenever the user mentions running evals, benchmarking, or optimizing any - agent, prompt, or skill — even if they don't explicitly say "agentv". - Do NOT use for writing or editing eval YAML files without running them — that belongs to - agentv-eval-writer. Do NOT use for analyzing existing trace files or result JSONL without - re-running evals — that belongs to agentv-trace-analyst. + Run AgentV evaluations and optimize agents through eval-driven iteration. + Triggers: run evals, benchmark agents, optimize prompts/skills against evals, compare + agent outputs across providers, analyze eval results, offline evaluation of recorded sessions. + Not for: writing/editing eval YAML without running (use agentv-eval-writer), + analyzing existing traces/JSONL without re-running (use agentv-trace-analyst). --- # AgentV Bench @@ -31,41 +26,7 @@ Your job when using this skill is to figure out where the user is in this proces Be flexible. 
If the user says "I don't need a full benchmark, just help me debug this failure", do that instead. -After the agent is working well, you can also run description optimization to improve skill triggering accuracy (see the Description Optimization section). - -## Bundled scripts layer - -This skill ships with a Python scripts layer in `plugins/agentv-dev/skills/agentv-bench/scripts/`. Requires Python 3.11+ and the `agentv` CLI installed. No extra dependencies — all scripts use the stdlib only. - -### Eval pipeline scripts (subagent mode) - -These scripts break the eval pipeline into discrete steps. The agent runs them in order, only handling LLM grading directly: - -- `scripts/run_tests.py --out ` — Extract inputs and invoke CLI targets in parallel. Writes `response.md` per test. For agent-as-target (`kind: "agent"`), only extracts inputs — executor subagents handle execution. -- `scripts/run_code_graders.py ` — Run code-grader assertions on existing responses. Writes per-grader results. -- `scripts/bench.py < llm_scores.json` — Merge code-grader + LLM scores, compute weighted pass_rate, write `grading.json` + `index.jsonl` + `benchmark.json`. - -### Subagent-mode workflow - -```bash -# 1. Extract inputs, invoke CLI targets, run code graders (one command): -# --out is optional; defaults to .agentv/results/runs/ -agentv pipeline run evals/repro.eval.yaml - -# 2. Subagent performs LLM grading (reads llm_graders/*.json, produces scores JSON) -# ... subagent reads prompts, grades responses, writes llm_scores.json ... - -# 3. Merge all scores and produce final artifacts (writes index.jsonl for dashboard) -agentv pipeline bench --llm-scores llm_scores.json - -# 4. 
Validate artifacts are dashboard-compatible -agentv results validate -``` - -### Skill management scripts -- `scripts/quick_validate.py` — validate SKILL.md structure and frontmatter -- `scripts/package_skill.py` — package skill into a distributable `.skill` zip -- `scripts/aggregate_benchmark.py` — aggregate grading results into benchmark statistics +After the agent is working well, you can also run description optimization to improve skill triggering accuracy (see `references/description-optimization.md`). ## Communicating with the user @@ -152,13 +113,7 @@ Start with 2-3 realistic test cases — the kind of thing a real user would actu Good assertions are objectively verifiable and have descriptive names. Subjective quality ("the output is good") is better evaluated qualitatively — don't force assertions onto things that need human judgment. -**Evaluator types** (from cheapest to most expensive): -- `exact`, `contains`, `regex`, `is-json` — deterministic, zero cost, instant -- `field-accuracy` — checks JSON field values against expected -- `composite` — weighted combination of multiple evaluators -- `code-grader` — Python/TypeScript scripts via `defineCodeGrader()` (→ see `agentv-eval-writer` skill) -- `tool-trajectory` — evaluate tool call sequences and patterns -- `llm-grader` — LLM-graded with rubric (most expensive, use when semantic understanding needed) +**Evaluator types** (cheapest to most expensive): `exact`, `contains`, `regex`, `is-json`, `field-accuracy`, `composite`, `code-grader`, `tool-trajectory`, `llm-grader`. See `references/eval-yaml-spec.md` for full config and grading recipes for each type. Prefer deterministic evaluators over LLM graders whenever possible. If an assertion can be checked with `contains` or `regex`, don't use `llm-grader`. @@ -176,16 +131,6 @@ Each run produces a new `.agentv/results/runs//` directory automatica If the user has not specified a mode, default to `subagent`. 
-### CLI resolution - -The Python wrapper `scripts/agentv_cli.py` resolves the `agentv` command deterministically: - -1. `AGENTV_CLI` environment variable (supports multi-word, e.g. `bun /path/to/cli.ts`) -2. `AGENTV_CLI` in nearest `.env` file (searching upward from cwd) -3. `agentv` on PATH - -Use `scripts/agentv_cli.py` (or the wrapper scripts that call it) to invoke the CLI. The Python wrapper scripts (`scripts/run_tests.py`, etc.) pick up `AGENTV_CLI` automatically — no extra steps needed when calling them. - | `SUBAGENT_EVAL_MODE` | Mode | How | |----------------------|------|-----| | `subagent` (default) | **Subagent mode** | Subagent-driven eval — parses eval.yaml, spawns executor + grader subagents. Zero CLI dependency. | @@ -193,7 +138,7 @@ Use `scripts/agentv_cli.py` (or the wrapper scripts that call it) to invoke the Set `SUBAGENT_EVAL_MODE` in `.env` at the project root as the default when no mode is specified. If absent, default to `subagent`. **User instruction always overrides this.** -**`subagent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. See "Subagent mode: Running eval.yaml without CLI" below. +**`subagent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. Read `references/subagent-pipeline.md` for the detailed procedure. **`cli`** — AgentV CLI handles execution, grading, and artifact generation end-to-end. Works with all providers. Use when you need multi-provider benchmarking or CLI-specific features. 
@@ -204,7 +149,7 @@ Set `SUBAGENT_EVAL_MODE` in `.env` at the project root as the default when no mo agentv eval --artifacts .agentv/artifacts/ ``` -**Subagent mode** — see "Subagent mode: Running eval.yaml without CLI" below. Parses eval.yaml directly and spawns executor/grader subagents. No CLI required. +**Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below. **Spawn all runs in the same turn.** For each test case that needs both a "with change" and a "baseline" run, launch them simultaneously. Don't run one set first and come back for the other — launch everything at once so results arrive around the same time. @@ -226,203 +171,63 @@ Good assertions are *discriminating* — they pass when the agent genuinely succ ### As runs complete, capture timing data -When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. **Save this data immediately** to `timing.json` in the run directory: - -```json -{ - "total_tokens": 84852, - "duration_ms": 23332, - "total_duration_seconds": 23.3 -} -``` +When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. **Save this data immediately** to `timing.json` in the run directory. See `references/schemas.md` for the timing.json schema. This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives. ### Grading -Once runs complete: - -**Subagent mode grading** — dispatch `grader` subagent (read `agents/grader.md`). The grader evaluates all assertion types natively: deterministic checks (contains, regex, is-json, etc.) via direct string operations, LLM-graded assertions via Claude's own reasoning, and `code-grader` via Bash script execution. No CLI call required. 
- -**CLI mode grading** — deterministic evaluators run automatically via CLI. LLM-graded assertions are handled by the configured LLM provider. - -Both modes write **grading.json** per test with this structure: -```json -{ - "assertions": [ - {"text": "Response includes error handling", "passed": true, "evidence": "Lines 12-15 contain try/catch block"}, - {"text": "Uses async/await pattern", "passed": false, "evidence": "Uses .then() callback pattern instead"} - ], - "summary": {"passed": 1, "failed": 1, "total": 2, "pass_rate": 0.5} -} -``` - -The grading.json `assertions` array must use the fields `text`, `passed`, and `evidence` — downstream tooling depends on these exact field names. +**In CLI mode**, `agentv eval` handles all grading end-to-end — no manual phases needed. -### Workspace features (EVAL.yaml only) +**In subagent mode**, grading has three phases. **All three are required — do not stop after phase 1.** -- **Workspace isolation** — clone repos, run setup/teardown hooks (before_all, before_each, after_each, after_all) -- **Materialization modes** — `pooled` (reuse slots), `temp` (fresh per run), `static` (existing dir) -- **Multi-repo** — clone multiple repos with sparse checkout and shallow clone support -- **File change tracking** — grade by diffing workspace files before/after agent execution - -### Artifacts - -All artifacts use established schemas — do not modify the structure: - -- **grading.json**: per-test `assertions` with `{text, passed, evidence}`, plus `summary` -- **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}` -- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}` with `mean ± stddev` - -Write artifacts to `.agentv/artifacts/` or the iteration directory. - -### Subagent mode: Running eval.yaml without CLI - -When `SUBAGENT_EVAL_MODE=subagent` (default), use the pipeline CLI subcommands (`pipeline input`, `pipeline grade`, `pipeline bench`) and Python wrapper scripts. 
This mode dispatches `executor` subagents to perform each test case, then `grader` subagents to evaluate the outputs. - -**Executor subagent eligibility:** All providers except `cli` are eligible for executor subagents by default. To opt out a specific target, set `subagent_mode_allowed: false` in `.agentv/targets.yaml`: - -```yaml -# .agentv/targets.yaml -targets: - - name: my-target - provider: openai - model: ${{ OPENAI_MODEL }} - api_key: ${{ OPENAI_API_KEY }} - subagent_mode_allowed: false # forces CLI invocation instead of executor subagent -``` - -When `subagent_mode_allowed: false`, the target falls back to CLI invocation via `agentv eval` even in subagent mode. - -**Prerequisites:** -- The eval.yaml file exists and contains valid test definitions -- `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`) -- Read `references/eval-yaml-spec.md` for the full schema - -**Workspace matters when evals need it:** Some evals pass prompt files directly and don't require a specific workspace — those run fine from anywhere. But evals that test agent behavior in a workspace (accessing skills, modifying repos, using tools across multiple repos) require the user to be in the **target workspace** (e.g., a multi-repo workspace set up by allagents). If the eval references workspace files or expects the agent to use skills, check that the current directory is the target workspace, not just the eval repo — and warn the user if it's wrong. - -**CLI targets: Single command** - -For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step. 
When `--out` is omitted, the output directory defaults to `.agentv/results/runs/` (same convention as `agentv eval`): +**Phase 1: Code graders** (deterministic, zero-cost) ```bash -# Extract inputs, invoke all CLI targets in parallel, run code graders: -# Output goes to .agentv/results/runs// by default -agentv pipeline run evals/repro.eval.yaml -``` - -The run directory is printed to stdout. Then the agent performs LLM grading and merges scores: - -```bash -agentv pipeline bench --llm-scores llm_scores.json - -# Validate artifacts are dashboard-compatible: -agentv results validate -``` - -That's the entire pipeline: **2 commands** + LLM grading + optional validation. - -**Non-CLI targets: Executor subagents** - -When the target provider is not `cli`, check `manifest.json` → `target.subagent_mode_allowed`. If `true` (default for all non-CLI providers), the subagent IS the target. If `false` (user opted out via `subagent_mode_allowed: false` in `.agentv/targets.yaml`), fall back to `agentv eval` CLI mode instead. - -For executor subagent targets, use `pipeline input` to extract inputs, then dispatch `executor` subagents to perform each test case: - -```bash -# Step 1: Extract inputs (defaults to .agentv/results/runs/) -agentv pipeline input evals/repro.eval.yaml -``` - -This creates a run directory with per-test `input.json`, `invoke.json` (with `kind: "agent"`), `criteria.md`, and grader configs. - -**Step 2: Dispatch executor subagents** — read `agents/executor.md`. Launch one `executor` subagent **per test case**, all in parallel. Each subagent receives the test directory path, reads `input.json`, performs the task using its own tools, and writes `response.md`. For example, 5 tests = 5 executor subagents launched simultaneously. 
- -``` -# Per executor subagent: -# - Reads //input.json -# - Performs the task -# - Writes //response.md +agentv pipeline grade ``` -**Step 3 onward: Grade and merge** — same as CLI targets: +This runs all `code-grader` assertions against the `response.md` files. Results are written to `/code_grader_results/.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run code graders inline. -```bash -# Step 3: Run code graders -agentv pipeline grade +**Phase 2: LLM grading** (semantic — do NOT skip this phase) -# Step 4: Subagent does LLM grading, writes llm_scores.json (see below) +Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. +Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously. -# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) -agentv pipeline bench --llm-scores llm_scores.json +**Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair. -# Step 6: Validate -agentv results validate -``` +Each grader subagent (read `agents/grader.md`): +1. Reads `/llm_graders/.json` for the grading prompt +2. Reads `/response.md` for the candidate output +3. Grades the response against the prompt criteria +4. Returns score (0.0–1.0) and per-assertion evidence -**Step-by-step (fine-grained control for CLI targets)** +After **all** grader subagents complete, merge their results into a single `llm_scores.json` in the run directory. 
-Use individual commands when you need control over each step with CLI targets: +**Phase 3: Merge and validate** ```bash -# Step 1: Extract inputs (defaults to .agentv/results/runs/) -agentv pipeline input evals/repro.eval.yaml - -# Step 2: run_tests.py invokes CLI targets (or use pipeline run instead) - -# Step 3: Run code graders -agentv pipeline grade - -# Step 4: Subagent does LLM grading, writes llm_scores.json - -# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) agentv pipeline bench --llm-scores llm_scores.json - -# Step 6: Validate agentv results validate ``` -**Step 3 (LLM grading): agent performs directly** - -The agent reads `llm_graders/.json` for each test, grades the response using the prompt content, and produces a scores JSON: - -```json -{ - "test-01": { - "relevance": { - "score": 0.85, - "assertions": [{"text": "Response is relevant", "passed": true, "evidence": "..."}] - } - } -} -``` - -Dispatch one `grader` subagent (read `agents/grader.md`) **per (test × LLM grader) pair**, all in parallel. For example, 5 tests × 2 LLM graders = 10 subagents launched simultaneously. Each subagent reads `/llm_graders/.json`, grades the corresponding `/response.md` against the `prompt_content` criteria, and returns its score (0.0–1.0) and assertions. After all subagents complete, merge their results into a single `llm_scores.json` in the run directory. +This merges code-grader + LLM scores, computes weighted pass_rate, writes `grading.json` + `index.jsonl` + `benchmark.json`. -**Note:** `pipeline bench` merges LLM scores into `index.jsonl` with a full `scores[]` array per entry, matching the CLI-mode schema. The web dashboard (`agentv results serve`) reads this format directly — no separate conversion script is needed. Run `agentv results validate ` to verify compatibility. 
+### Artifacts -**Note on Python wrapper scripts:** The `scripts/` directory contains Python wrappers (`run_tests.py`, `run_code_graders.py`, `bench.py`) that call the CLI commands. These are provided as an alternative but the direct CLI commands above are preferred — they work cross-platform without Python dependency issues. +All artifacts use established schemas — see `references/schemas.md` for the full definitions. Do not modify the structure. Key artifacts per run: +- **grading.json**: per-test assertions with `{text, passed, evidence}`, plus summary +- **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}` +- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}` -**Output structure:** +Write artifacts to `.agentv/artifacts/` or the iteration directory. -The path hierarchy mirrors the CLI mode: `` comes from the `name` field in the eval.yaml. The target is recorded in `manifest.json` — one run = one target. +### Workspace features (EVAL.yaml only) -``` -.agentv/results/runs// -├── manifest.json ← eval metadata, target, test_ids -├── index.jsonl ← per-test scores -├── benchmark.json ← aggregate statistics -└── / ← from eval.yaml "name" field (omitted if absent) - └── / ← test case id - ├── input.json ← test input text + messages - ├── invoke.json ← target command or agent instructions - ├── criteria.md ← grading criteria - ├── response.md ← target/agent output - ├── timing.json ← execution timing - ├── code_graders/.json ← code grader configs - ├── llm_graders/.json ← LLM grader configs - ├── code_grader_results/.json ← code grader results - └── grading.json ← merged grading -``` +- **Workspace isolation** — clone repos, run setup/teardown hooks (before_all, before_each, after_each, after_all) +- **Materialization modes** — `pooled` (reuse slots), `temp` (fresh per run), `static` (existing dir) +- **Multi-repo** — clone multiple repos with sparse checkout and shallow clone support +- **File change tracking** — grade by 
diffing workspace files before/after agent execution --- @@ -548,118 +353,13 @@ This is optional and requires subagents. The human review loop is usually suffic ## Description Optimization -The `description` field in a skill's SKILL.md frontmatter is the primary mechanism that determines whether Claude invokes the skill. After the agent/skill is working well, offer to optimize the description for better triggering accuracy. - -**Provider compatibility**: Description optimization is specific to agents with skill-discovery mechanisms (e.g., Claude Code). Agents like Copilot and Codex don't have skill systems, so description optimization doesn't apply to them. The `skill-trigger` evaluator still works for these providers — it just checks whether the agent invoked the right tools, not whether it discovered the skill via description matching. - -### Step 1: Generate trigger EVAL.yaml - -Create 20 test cases: -- **10 should-trigger**: realistic prompts where this skill should activate — different phrasings, casual speech, uncommon use cases, edge cases where this skill competes with another but should win -- **10 should-not-trigger**: near-miss prompts that share keywords but actually need something different — adjacent domains, ambiguous phrasing where naive matching would trigger but shouldn't - -Prompts must be realistic — include file paths, personal context, typos, casual speech. Not abstract requests like "format data" but concrete ones like "ok so my boss sent me Q4-sales-FINAL-v2.xlsx and she wants me to add a profit margin column..." - -The should-not-trigger cases are the most valuable. "Write a fibonacci function" as a negative test for an eval skill is useless — it doesn't test anything. The negative cases should be genuinely tricky near-misses. 
- -Write as EVAL.yaml with top-level input (the user prompt doesn't specify the skill name — it's a natural utterance): - -```yaml -# trigger-eval.eval.yaml -tests: - - id: should-trigger-casual-optimize - input: "ok so I have this agent that keeps failing on the code review tasks, can you help me figure out why and fix it" - assertions: - - type: contains - value: "agentv-bench" - - id: should-not-trigger-build-error - input: "my TypeScript build is failing with type errors in src/auth.ts" - assertions: - - type: not-contains - value: "agentv-bench" -``` - -### Step 2: Review with user - -Present the eval set. The user adjusts queries, toggles should-trigger, adds/removes cases. This step matters — bad eval queries lead to bad descriptions. - -### Step 3: Iterate on description - -Run the trigger eval, identify misfires, rewrite the description, re-run. Max 5 iterations. Select best description by held-out test accuracy (split 60% train / 40% test) to avoid overfitting. - -Use the grader and analyzer subagents to identify trigger failures and propose description improvements — the same eval → grade → analyze → improve loop used for agent output quality. - -### Step 4: Apply - -Update the skill's SKILL.md frontmatter with the optimized description. Show the user before/after with accuracy scores. +After the agent is working well, offer to optimize the skill's `description` field for better triggering accuracy. Read `references/description-optimization.md` for the full procedure (generate trigger EVAL.yaml, review with user, iterate, apply). --- ## Environment Adaptation -**CI/headless mode**: Skip interactive prompts. Exit with pass/fail status code. Always generate artifacts for downstream consumption. - -**No subagents available** (e.g., Claude.ai): Run test cases serially. Skip blind comparison. Present results directly in conversation — for each test case, show the prompt and output. Ask for feedback inline. 
Skip benchmarking (it relies on baseline comparisons that aren't meaningful without subagents). - -**Note**: "Description Optimization" (iterating on SKILL.md descriptions for better triggering accuracy) requires an agent with a skill-discovery mechanism. Agents that don't have skill systems (Copilot, Codex) still benefit from evaluation for testing whether they invoke the right tools. - -**Provider-specific notes**: -- **Copilot CLI**: Uses ACP protocol via `copilot --acp --stdio` -- **Claude SDK**: Requires `@anthropic-ai/claude-agent-sdk` installed -- **Codex**: Supports skills via `.agents/` or `.codex/` folders. Emits `command_execution` and `file_change` tool calls. -- **Custom CLI**: Needs `command` and output file pattern in target config -- **Target config**: Uses `${{ ENV_VAR }}` syntax (not `${ENV_VAR}`) for API keys - -### Unsupported providers: use a code-grader - -The built-in `skill-trigger` evaluator covers Claude, Copilot, Pi, Codex and VS Code out of the box. For providers with different tool-call formats, write a code-grader that inspects the agent's tool call trace. - -A code-grader receives the full evaluation context including the agent's output messages and tool calls. You can inspect these to determine whether the skill was invoked: - -```yaml -# Example: code-grader for Codex skill-trigger detection -tests: - - id: should-trigger-codex - input: "Analyze this CSV file" - assertions: - - type: code-grader - path: ./judges/codex-skill-trigger.ts -``` - -```typescript -// judges/codex-skill-trigger.ts -import { defineCodeGrader } from '@agentv/eval'; - -export default defineCodeGrader(({ output }) => { - const skillName = 'csv-analyzer'; - const toolCalls = (output ?? []).flatMap((msg) => msg.toolCalls ?? 
[]); - const firstTool = toolCalls[0]; - - if (!firstTool) { - return { score: 0, reason: 'No tool calls recorded' }; - } - - // Codex reads skill files via shell commands - if (firstTool.tool === 'command_execution') { - const cmd = String(firstTool.input ?? ''); - if (cmd.includes(skillName)) { - return { score: 1, reason: `Skill "${skillName}" triggered via command: ${cmd}` }; - } - } - - // Check if skill file was read via file_change or other tools - if (firstTool.tool === 'file_change') { - const path = String((firstTool.input as Record)?.path ?? ''); - if (path.includes(skillName)) { - return { score: 1, reason: `Skill file accessed: ${path}` }; - } - } - - return { score: 0, reason: `First tool was "${firstTool.tool}" — not a skill invocation for "${skillName}"` }; -}); -``` - -This approach is more flexible than config overrides — you can match any tool-call pattern, check multiple fields, and add provider-specific logic as needed. +For provider-specific notes (Copilot, Codex, Claude SDK, custom CLI), CI/headless mode behavior, and fallback strategies when subagents aren't available, read `references/environment-adaptation.md`. --- @@ -670,12 +370,16 @@ The `agents/` directory contains instructions for specialized subagents. 
Read th | Agent | File | Purpose | When to dispatch | |-------|------|---------|-----------------| | executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | -| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading LLM-judged assertions) | +| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading — one per test × LLM grader pair) | | comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | | analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | The `references/` directory has additional documentation: -- `references/eval-yaml-spec.md` — Eval YAML schema and assertion grading recipes (read when running subagent-mode evals) +- `references/eval-yaml-spec.md` — Eval YAML schema and assertion grading recipes +- `references/subagent-pipeline.md` — Detailed subagent-mode pipeline commands and output structure +- `references/description-optimization.md` — Skill description optimization workflow +- `references/environment-adaptation.md` — Provider-specific notes and CI/headless behavior +- `references/schemas.md` — JSON schemas for all artifacts (grading.json, benchmark.json, etc.) - `references/migrating-from-skill-creator.md` — Guide for users coming from Anthropic's skill-creator --- diff --git a/plugins/agentv-dev/skills/agentv-bench/references/description-optimization.md b/plugins/agentv-dev/skills/agentv-bench/references/description-optimization.md new file mode 100644 index 000000000..1a0134cba --- /dev/null +++ b/plugins/agentv-dev/skills/agentv-bench/references/description-optimization.md @@ -0,0 +1,66 @@ +# Description Optimization + +Optimize the `description` field in a skill's SKILL.md frontmatter for better triggering +accuracy. 
Use this after the agent/skill is working well — this is a polish step, not a +core workflow step. + +**Provider compatibility**: Description optimization applies to any agent platform with +skill-discovery mechanisms — Claude Code, Codex (`.agents/` or `.codex/` folders), Copilot, +and others. The `skill-trigger` evaluator checks whether the agent invoked the right skill, +regardless of how discovery works on that platform. + +## Step 1: Generate Trigger EVAL.yaml + +Create 20 test cases: +- **10 should-trigger**: realistic prompts where this skill should activate — different + phrasings, casual speech, uncommon use cases, edge cases where this skill competes with + another but should win +- **10 should-not-trigger**: near-miss prompts that share keywords but actually need + something different — adjacent domains, ambiguous phrasing where naive matching would + trigger but shouldn't + +Prompts must be realistic — include file paths, personal context, typos, casual speech. +Not abstract requests like "format data" but concrete ones like "ok so my boss sent me +Q4-sales-FINAL-v2.xlsx and she wants me to add a profit margin column..." + +The should-not-trigger cases are the most valuable. "Write a fibonacci function" as a +negative test for an eval skill is useless — it doesn't test anything. The negative cases +should be genuinely tricky near-misses. 
+ +Write as EVAL.yaml with top-level input (the user prompt doesn't specify the skill name — +it's a natural utterance): + +```yaml +# trigger-eval.eval.yaml +tests: + - id: should-trigger-casual-optimize + input: "ok so I have this agent that keeps failing on the code review tasks, can you help me figure out why and fix it" + assertions: + - type: skill-trigger + skill: agentv-bench + - id: should-not-trigger-build-error + input: "my TypeScript build is failing with type errors in src/auth.ts" + assertions: + - type: skill-trigger + skill: agentv-bench + should_trigger: false +``` + +## Step 2: Review with User + +Present the eval set. The user adjusts queries, toggles should-trigger, adds/removes cases. +This step matters — bad eval queries lead to bad descriptions. + +## Step 3: Iterate on Description + +Run the trigger eval, identify misfires, rewrite the description, re-run. Max 5 iterations. +Select best description by held-out test accuracy (split 60% train / 40% test) to avoid +overfitting. + +Use the grader and analyzer subagents to identify trigger failures and propose description +improvements — the same eval → grade → analyze → improve loop used for agent output quality. + +## Step 4: Apply + +Update the skill's SKILL.md frontmatter with the optimized description. Show the user +before/after with accuracy scores. diff --git a/plugins/agentv-dev/skills/agentv-bench/references/environment-adaptation.md b/plugins/agentv-dev/skills/agentv-bench/references/environment-adaptation.md new file mode 100644 index 000000000..bcd522cd8 --- /dev/null +++ b/plugins/agentv-dev/skills/agentv-bench/references/environment-adaptation.md @@ -0,0 +1,82 @@ +# Environment Adaptation + +Provider-specific notes, CI/headless behavior, and fallback strategies for environments +with limited capabilities. + +## CI/Headless Mode + +Skip interactive prompts. Exit with pass/fail status code. Always generate artifacts for +downstream consumption. 
+ +## No Subagents Available (e.g., Claude.ai) + +Run test cases serially. Skip blind comparison. Present results directly in conversation — +for each test case, show the prompt and output. Ask for feedback inline. Skip benchmarking +(it relies on baseline comparisons that aren't meaningful without subagents). + +## Provider-Specific Notes + +- **Copilot CLI**: Uses ACP protocol via `copilot --acp --stdio` +- **Claude SDK**: Requires `@anthropic-ai/claude-agent-sdk` installed +- **Codex**: Supports skills via `.agents/` or `.codex/` folders. Emits `command_execution` + and `file_change` tool calls. +- **Custom CLI**: Needs `command` and output file pattern in target config +- **Target config**: Uses `${{ ENV_VAR }}` syntax (not `${ENV_VAR}`) for API keys + +**Note**: "Description Optimization" (see `references/description-optimization.md`) applies +to any platform with skill-discovery mechanisms. All listed providers support skills. + +## Unsupported Providers: Use a Code-Grader + +The built-in `skill-trigger` evaluator covers Claude, Copilot, Pi, Codex and VS Code out +of the box. For providers with different tool-call formats, write a code-grader that inspects +the agent's tool call trace. + +A code-grader receives the full evaluation context including the agent's output messages and +tool calls. You can inspect these to determine whether the skill was invoked: + +```yaml +# Example: code-grader for Codex skill-trigger detection +tests: + - id: should-trigger-codex + input: "Analyze this CSV file" + assertions: + - type: code-grader + path: ./judges/codex-skill-trigger.ts +``` + +```typescript +// judges/codex-skill-trigger.ts +import { defineCodeGrader } from '@agentv/eval'; + +export default defineCodeGrader(({ output }) => { + const skillName = 'csv-analyzer'; + const toolCalls = (output ?? []).flatMap((msg) => msg.toolCalls ?? 
[]);
+  const firstTool = toolCalls[0];
+
+  if (!firstTool) {
+    return { score: 0, reason: 'No tool calls recorded' };
+  }
+
+  // Codex reads skill files via shell commands
+  if (firstTool.tool === 'command_execution') {
+    const cmd = String(firstTool.input ?? '');
+    if (cmd.includes(skillName)) {
+      return { score: 1, reason: `Skill "${skillName}" triggered via command: ${cmd}` };
+    }
+  }
+
+  // Check if skill file was read via file_change or other tools
+  if (firstTool.tool === 'file_change') {
+    const path = String((firstTool.input as Record<string, unknown>)?.path ?? '');
+    if (path.includes(skillName)) {
+      return { score: 1, reason: `Skill file accessed: ${path}` };
+    }
+  }
+
+  return { score: 0, reason: `First tool was "${firstTool.tool}" — not a skill invocation for "${skillName}"` };
+});
+```
+
+This approach is more flexible than config overrides — you can match any tool-call pattern,
+check multiple fields, and add provider-specific logic as needed.
diff --git a/plugins/agentv-dev/skills/agentv-bench/references/schemas.md b/plugins/agentv-dev/skills/agentv-bench/references/schemas.md
index 7ffe245ff..b02ca0633 100644
--- a/plugins/agentv-dev/skills/agentv-bench/references/schemas.md
+++ b/plugins/agentv-dev/skills/agentv-bench/references/schemas.md
@@ -87,6 +87,8 @@ Tracks version progression in Improve mode. Located at workspace root.
 
 Output from the grader agent. Located at `/grading.json`.
 
+**Important:** The `assertions` array must use the fields `text`, `passed`, and `evidence` — downstream tooling depends on these exact field names.
+ ```json { "assertions": [ diff --git a/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md b/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md new file mode 100644 index 000000000..dfb918bcc --- /dev/null +++ b/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md @@ -0,0 +1,165 @@ +# Subagent Pipeline — Running eval.yaml without CLI + +This reference documents the detailed procedure for running evaluations in subagent mode +(`SUBAGENT_EVAL_MODE=subagent`, the default). The orchestrating skill dispatches `executor` +subagents to perform test cases and `grader` subagents to evaluate outputs. + +Read this reference when executing Step 3 (Run and Grade) in subagent mode. + +## Prerequisites + +- The eval.yaml file exists and contains valid test definitions +- `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`) +- Read `references/eval-yaml-spec.md` for the full schema + +## Workspace Context + +Some evals pass prompt files directly and don't require a specific workspace — those run fine +from anywhere. But evals that test agent behavior in a workspace (accessing skills, modifying +repos, using tools across multiple repos) require the user to be in the **target workspace** +(e.g., a multi-repo workspace set up by allagents). If the eval references workspace files or +expects the agent to use skills, check that the current directory is the target workspace, not +just the eval repo — and warn the user if it's wrong. + +## Executor Subagent Eligibility + +All providers except `cli` are eligible for executor subagents by default. 
To opt out a +specific target, set `subagent_mode_allowed: false` in `.agentv/targets.yaml`: + +```yaml +# .agentv/targets.yaml +targets: + - name: my-target + provider: openai + model: ${{ OPENAI_MODEL }} + api_key: ${{ OPENAI_API_KEY }} + subagent_mode_allowed: false # forces CLI invocation instead of executor subagent +``` + +When `subagent_mode_allowed: false`, the target falls back to CLI invocation via `agentv eval` +even in subagent mode. + +## CLI Targets: Single Command + +For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and +code grading in one step. When `--out` is omitted, the output directory defaults to +`.agentv/results/runs/` (same convention as `agentv eval`): + +```bash +# Extract inputs and invoke all CLI targets in parallel: +agentv pipeline run evals/repro.eval.yaml + +# Also run code graders inline (instead of using pipeline grade separately): +agentv pipeline run evals/repro.eval.yaml --grader-type code +``` + +By default, `pipeline run` extracts inputs and invokes targets only. Pass `--grader-type code` +to also run code-graders inline, or use `agentv pipeline grade ` as a separate step. + +The run directory is printed to stdout. Then continue to the grading and merge phases +described in SKILL.md Step 3. + +## Non-CLI Targets: Executor Subagents + +When the target provider is not `cli`, check `manifest.json` → `target.subagent_mode_allowed`. +If `true` (default for all non-CLI providers), the subagent IS the target. If `false` (user +opted out via `subagent_mode_allowed: false` in `.agentv/targets.yaml`), fall back to +`agentv eval` CLI mode instead. + +### Step 1: Extract inputs + +```bash +# Defaults to .agentv/results/runs/ +agentv pipeline input evals/repro.eval.yaml +``` + +This creates a run directory with per-test `input.json`, `invoke.json`, +`criteria.md`, and grader configs. + +### Step 2: Dispatch executor subagents + +Read `agents/executor.md`. 
Launch one `executor` subagent **per test case**, all in parallel. +Each subagent receives the test directory path, reads `input.json`, performs the task using +its own tools, and writes `response.md`. + +Example: 5 tests = 5 executor subagents launched simultaneously. + +``` +# Per executor subagent: +# - Reads //input.json +# - Performs the task +# - Writes //response.md +``` + +### Step 3 onward: Grade and merge + +See SKILL.md Step 3 "Grading" section for the three-phase grading process (code graders → +LLM grading → merge and validate). + +## Step-by-Step Fine-Grained Control (CLI targets) + +Use individual commands when you need control over each step with CLI targets: + +```bash +# Step 1: Extract inputs (defaults to .agentv/results/runs/) +agentv pipeline input evals/repro.eval.yaml + +# Step 2: run_tests.py invokes CLI targets (or use pipeline run instead) + +# Step 3: Run code graders +agentv pipeline grade + +# Step 4: Subagent does LLM grading, writes llm_scores.json + +# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) +agentv pipeline bench --llm-scores llm_scores.json + +# Step 6: Validate +agentv results validate +``` + +## LLM Grading JSON Format + +The agent reads `llm_graders/.json` for each test, grades the response using the prompt +content, and produces a scores JSON: + +```json +{ + "test-01": { + "relevance": { + "score": 0.85, + "assertions": [{"text": "Response is relevant", "passed": true, "evidence": "..."}] + } + } +} +``` + +## Pipeline Bench and Dashboard + +`pipeline bench` merges LLM scores into `index.jsonl` with a full `scores[]` array per entry, +matching the CLI-mode schema. The web dashboard (`agentv results serve`) reads this format +directly — no separate conversion script is needed. Run `agentv results validate ` +to verify compatibility. + +## Output Structure + +The path hierarchy mirrors the CLI mode: `` comes from the `name` field in +the eval.yaml. 
The target is recorded in `manifest.json` — one run = one target. + +``` +.agentv/results/runs// +├── manifest.json ← eval metadata, target, test_ids +├── index.jsonl ← per-test scores +├── benchmark.json ← aggregate statistics +└── / ← from eval.yaml "name" field (omitted if absent) + └── / ← test case id + ├── input.json ← test input text + messages + ├── invoke.json ← target command or agent instructions + ├── criteria.md ← grading criteria + ├── response.md ← target/agent output + ├── timing.json ← execution timing + ├── code_graders/.json ← code grader configs + ├── llm_graders/.json ← LLM grader configs + ├── code_grader_results/.json ← code grader results + └── grading.json ← merged grading +``` diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/__init__.py b/plugins/agentv-dev/skills/agentv-bench/scripts/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/agentv_cli.py b/plugins/agentv-dev/skills/agentv-bench/scripts/agentv_cli.py deleted file mode 100644 index 3e8fe40af..000000000 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/agentv_cli.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Resolve and invoke the agentv CLI. - -Centralises CLI resolution so individual scripts don't duplicate -the lookup logic. Also usable as a standalone wrapper: - - uv run agentv_cli.py eval evals/my.eval.yaml --artifacts out/ - -Resolution order: -1. AGENTV_CLI environment variable -2. AGENTV_CLI in nearest .env file (searching upward from cwd) -3. 
`agentv` on PATH -""" -import os -import shutil -import subprocess -import sys -from pathlib import Path - - -def _find_env_key(key: str) -> str | None: - """Search up from cwd for .env and return a specific key value.""" - current = Path(os.getcwd()) - while True: - env_file = current / ".env" - if env_file.exists(): - for line in env_file.read_text().splitlines(): - line = line.strip() - if line.startswith(f"{key}="): - return line[len(key) + 1 :] - parent = current.parent - if parent == current: - break - current = parent - return None - - -def find_agentv() -> list[str]: - """Resolve the agentv CLI command. - - Checks AGENTV_CLI env var first (supports multi-word commands like - 'bun /path/to/cli.ts' for running from source). If not in environment, - also searches the nearest .env file. Falls back to PATH lookup. - """ - cli = os.environ.get("AGENTV_CLI") or _find_env_key("AGENTV_CLI") - if cli: - parts = cli.split() - if parts: - return parts - path = shutil.which("agentv") - if not path: - print( - "agentv CLI not found. Set AGENTV_CLI in .env or install: bun install -g agentv", - file=sys.stderr, - ) - sys.exit(1) - return [path] - - -def main() -> None: - """Pass-through wrapper: resolve agentv and forward all arguments.""" - cmd = find_agentv() + sys.argv[1:] - sys.exit(subprocess.call(cmd)) - - -if __name__ == "__main__": - main() diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/aggregate_benchmark.py b/plugins/agentv-dev/skills/agentv-bench/scripts/aggregate_benchmark.py deleted file mode 100644 index 9c7f61afc..000000000 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/aggregate_benchmark.py +++ /dev/null @@ -1,405 +0,0 @@ -#!/usr/bin/env python3 -""" -Aggregate individual run results into benchmark summary statistics. 
- -Reads grading.json files from run directories and produces: -- run_summary with mean, stddev, min, max for each metric -- delta between with_skill and without_skill configurations - -Usage: - python aggregate_benchmark.py - -Example: - python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ - -The script supports two directory layouts: - - Workspace layout (from skill-creator iterations): - / - └── eval-N/ - ├── with_skill/ - │ ├── run-1/grading.json - │ └── run-2/grading.json - └── without_skill/ - ├── run-1/grading.json - └── run-2/grading.json - - Legacy layout (with runs/ subdirectory): - / - └── runs/ - └── eval-N/ - ├── with_skill/ - │ └── run-1/grading.json - └── without_skill/ - └── run-1/grading.json -""" - -import argparse -import json -import math -import sys -from datetime import datetime, timezone -from pathlib import Path - - -def calculate_stats(values: list[float]) -> dict: - """Calculate mean, stddev, min, max for a list of values.""" - if not values: - return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} - - n = len(values) - mean = sum(values) / n - - if n > 1: - variance = sum((x - mean) ** 2 for x in values) / (n - 1) - stddev = math.sqrt(variance) - else: - stddev = 0.0 - - return { - "mean": round(mean, 4), - "stddev": round(stddev, 4), - "min": round(min(values), 4), - "max": round(max(values), 4) - } - - -def load_run_results(benchmark_dir: Path) -> dict: - """ - Load all run results from a benchmark directory. - - Returns dict keyed by config name (e.g. "with_skill"/"without_skill", - or "new_skill"/"old_skill"), each containing a list of run results. 
- """ - # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ - runs_dir = benchmark_dir / "runs" - if runs_dir.exists(): - search_dir = runs_dir - elif list(benchmark_dir.glob("eval-*")): - search_dir = benchmark_dir - else: - print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") - return {} - - results: dict[str, list] = {} - - for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): - metadata_path = eval_dir / "eval_metadata.json" - if metadata_path.exists(): - try: - with open(metadata_path) as mf: - eval_id = json.load(mf).get("eval_id", eval_idx) - except (json.JSONDecodeError, OSError): - eval_id = eval_idx - else: - try: - eval_id = int(eval_dir.name.split("-")[1]) - except ValueError: - eval_id = eval_idx - - # Discover config directories dynamically rather than hardcoding names - for config_dir in sorted(eval_dir.iterdir()): - if not config_dir.is_dir(): - continue - # Skip non-config directories (inputs, outputs, etc.) 
- if not list(config_dir.glob("run-*")): - continue - config = config_dir.name - if config not in results: - results[config] = [] - - for run_dir in sorted(config_dir.glob("run-*")): - run_number = int(run_dir.name.split("-")[1]) - grading_file = run_dir / "grading.json" - - if not grading_file.exists(): - print(f"Warning: grading.json not found in {run_dir}") - continue - - try: - with open(grading_file) as f: - grading = json.load(f) - except json.JSONDecodeError as e: - print(f"Warning: Invalid JSON in {grading_file}: {e}") - continue - - # Extract metrics - result = { - "eval_id": eval_id, - "run_number": run_number, - "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), - "passed": grading.get("summary", {}).get("passed", 0), - "failed": grading.get("summary", {}).get("failed", 0), - "total": grading.get("summary", {}).get("total", 0), - } - - # Extract timing — check grading.json first, then sibling timing.json - timing = grading.get("timing", {}) - result["time_seconds"] = timing.get("total_duration_seconds", 0.0) - timing_file = run_dir / "timing.json" - if result["time_seconds"] == 0.0 and timing_file.exists(): - try: - with open(timing_file) as tf: - timing_data = json.load(tf) - result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) - result["tokens"] = timing_data.get("total_tokens", 0) - except json.JSONDecodeError: - pass - - # Extract metrics if available - metrics = grading.get("execution_metrics", {}) - result["tool_calls"] = metrics.get("total_tool_calls", 0) - if not result.get("tokens"): - result["tokens"] = metrics.get("output_chars", 0) - result["errors"] = metrics.get("errors_encountered", 0) - - # Three naming conventions exist for grading results across the ecosystem: - # "assertion_results" — agentskills docs (evaluating-skills.mdx) - # "assertions" — AgentV native format - # "expectations" — skill-creator plugin (schemas.md) - # All use the same shape: [{text, passed, evidence}]. Fall through in priority order. 
- raw_assertions = grading.get("assertion_results", grading.get("assertions", grading.get("expectations", []))) - for exp in raw_assertions: - if "text" not in exp or "passed" not in exp: - print(f"Warning: assertion_result in {grading_file} missing required fields (text, passed, evidence): {exp}") - result["assertion_results"] = raw_assertions - - # Extract notes from user_notes_summary - notes_summary = grading.get("user_notes_summary") or {} - notes = [] - notes.extend(notes_summary.get("uncertainties", [])) - notes.extend(notes_summary.get("needs_review", [])) - notes.extend(notes_summary.get("workarounds", [])) - result["notes"] = notes - - results[config].append(result) - - return results - - -def aggregate_results(results: dict) -> dict: - """ - Aggregate run results into summary statistics. - - Returns run_summary with stats for each configuration and delta. - """ - run_summary = {} - configs = list(results.keys()) - - for config in configs: - runs = results.get(config, []) - - if not runs: - run_summary[config] = { - "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, - "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, - "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} - } - continue - - pass_rates = [r["pass_rate"] for r in runs] - times = [r["time_seconds"] for r in runs] - tokens = [r.get("tokens", 0) for r in runs] - - run_summary[config] = { - "pass_rate": calculate_stats(pass_rates), - "time_seconds": calculate_stats(times), - "tokens": calculate_stats(tokens) - } - - # Calculate delta between the first two configs (if two exist) - if len(configs) >= 2: - primary = run_summary.get(configs[0], {}) - baseline = run_summary.get(configs[1], {}) - else: - primary = run_summary.get(configs[0], {}) if configs else {} - baseline = {} - - delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) - delta_time = primary.get("time_seconds", {}).get("mean", 0) - 
baseline.get("time_seconds", {}).get("mean", 0) - delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) - - run_summary["delta"] = { - "pass_rate": f"{delta_pass_rate:+.2f}", - "time_seconds": f"{delta_time:+.1f}", - "tokens": f"{delta_tokens:+.0f}" - } - - return run_summary - - -def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: - """ - Generate complete benchmark.json from run results. - """ - results = load_run_results(benchmark_dir) - run_summary = aggregate_results(results) - - # Build runs array for benchmark.json - runs = [] - for config in results: - for result in results[config]: - runs.append({ - "eval_id": result["eval_id"], - "configuration": config, - "run_number": result["run_number"], - "result": { - "pass_rate": result["pass_rate"], - "passed": result["passed"], - "failed": result["failed"], - "total": result["total"], - "time_seconds": result["time_seconds"], - "tokens": result.get("tokens", 0), - "tool_calls": result.get("tool_calls", 0), - "errors": result.get("errors", 0) - }, - "assertion_results": result["assertion_results"], - "notes": result["notes"] - }) - - # Determine eval IDs from results - eval_ids = sorted(set( - r["eval_id"] - for config in results.values() - for r in config - )) - - benchmark = { - "metadata": { - "skill_name": skill_name or "", - "skill_path": skill_path or "", - "executor_model": "", - "analyzer_model": "", - "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), - "evals_run": eval_ids, - "runs_per_configuration": 3 - }, - "runs": runs, - "run_summary": run_summary, - "notes": [] # To be filled by analyzer - } - - return benchmark - - -def generate_markdown(benchmark: dict) -> str: - """Generate human-readable benchmark.md from benchmark data.""" - metadata = benchmark["metadata"] - run_summary = benchmark["run_summary"] - - # Determine config names (excluding "delta") - configs = [k for k in run_summary 
if k != "delta"] - config_a = configs[0] if len(configs) >= 1 else "config_a" - config_b = configs[1] if len(configs) >= 2 else "config_b" - label_a = config_a.replace("_", " ").title() - label_b = config_b.replace("_", " ").title() - - lines = [ - f"# Skill Benchmark: {metadata['skill_name']}", - "", - f"**Model**: {metadata['executor_model']}", - f"**Date**: {metadata['timestamp']}", - f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)", - "", - "## Summary", - "", - f"| Metric | {label_a} | {label_b} | Delta |", - "|--------|------------|---------------|-------|", - ] - - a_summary = run_summary.get(config_a, {}) - b_summary = run_summary.get(config_b, {}) - delta = run_summary.get("delta", {}) - - # Format pass rate - a_pr = a_summary.get("pass_rate", {}) - b_pr = b_summary.get("pass_rate", {}) - lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") - - # Format time - a_time = a_summary.get("time_seconds", {}) - b_time = b_summary.get("time_seconds", {}) - lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") - - # Format tokens - a_tokens = a_summary.get("tokens", {}) - b_tokens = b_summary.get("tokens", {}) - lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") - - # Notes section - if benchmark.get("notes"): - lines.extend([ - "", - "## Notes", - "" - ]) - for note in benchmark["notes"]: - lines.append(f"- {note}") - - return "\n".join(lines) - - -def main(): - parser = argparse.ArgumentParser( - description="Aggregate benchmark run results into summary statistics" - ) - 
parser.add_argument( - "benchmark_dir", - type=Path, - help="Path to the benchmark directory" - ) - parser.add_argument( - "--skill-name", - default="", - help="Name of the skill being benchmarked" - ) - parser.add_argument( - "--skill-path", - default="", - help="Path to the skill being benchmarked" - ) - parser.add_argument( - "--output", "-o", - type=Path, - help="Output path for benchmark.json (default: /benchmark.json)" - ) - - args = parser.parse_args() - - if not args.benchmark_dir.exists(): - print(f"Directory not found: {args.benchmark_dir}") - sys.exit(1) - - # Generate benchmark - benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) - - # Determine output paths - output_json = args.output or (args.benchmark_dir / "benchmark.json") - output_md = output_json.with_suffix(".md") - - # Write benchmark.json - with open(output_json, "w") as f: - json.dump(benchmark, f, indent=2) - print(f"Generated: {output_json}") - - # Write benchmark.md - markdown = generate_markdown(benchmark) - with open(output_md, "w") as f: - f.write(markdown) - print(f"Generated: {output_md}") - - # Print summary - run_summary = benchmark["run_summary"] - configs = [k for k in run_summary if k != "delta"] - delta = run_summary.get("delta", {}) - - print(f"\nSummary:") - for config in configs: - pr = run_summary[config]["pass_rate"]["mean"] - label = config.replace("_", " ").title() - print(f" {label}: {pr*100:.1f}% pass rate") - print(f" Delta: {delta.get('pass_rate', '—')}") - - -if __name__ == "__main__": - main() diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py b/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py deleted file mode 100644 index d7616bb13..000000000 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/bench.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -""" -Merge evaluator scores and produce final benchmark artifacts. 
- -Calls `agentv pipeline bench` to merge code-grader results with LLM grader -scores, compute weighted pass_rate, and write grading.json + index.jsonl + -benchmark.json. - -Usage: - python bench.py < llm_scores.json - echo '{"test-01": {"relevance": {"score": 0.8, ...}}}' | python bench.py - -Example: - python bench.py .agentv/results/export/run-1 < llm_scores.json - -Stdin format (LLM grader scores): - { - "": { - "": { - "score": 0.85, - "assertions": [{"text": "...", "passed": true, "evidence": "..."}] - } - } - } - -Output: - /index.jsonl <- per-test manifest - /benchmark.json <- aggregate statistics - //grading.json <- merged grading per test -""" -import argparse -import subprocess -import sys - - -from agentv_cli import find_agentv as _find_agentv - - -def main(): - parser = argparse.ArgumentParser( - description="Merge scores and produce benchmark artifacts" - ) - parser.add_argument("export_dir", help="Export directory") - args = parser.parse_args() - - # Pass stdin through to agentv pipeline bench - result = subprocess.run( - [*_find_agentv(), "pipeline", "bench", args.export_dir], - stdin=sys.stdin, - ) - sys.exit(result.returncode) - - -if __name__ == "__main__": - main() diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/package_skill.py b/plugins/agentv-dev/skills/agentv-bench/scripts/package_skill.py deleted file mode 100644 index f48eac444..000000000 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/package_skill.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python3 -""" -Skill Packager - Creates a distributable .skill file of a skill folder - -Usage: - python utils/package_skill.py [output-directory] - -Example: - python utils/package_skill.py skills/public/my-skill - python utils/package_skill.py skills/public/my-skill ./dist -""" - -import fnmatch -import sys -import zipfile -from pathlib import Path -from scripts.quick_validate import validate_skill - -# Patterns to exclude when packaging skills. 
-EXCLUDE_DIRS = {"__pycache__", "node_modules"} -EXCLUDE_GLOBS = {"*.pyc"} -EXCLUDE_FILES = {".DS_Store"} -# Directories excluded only at the skill root (not when nested deeper). -ROOT_EXCLUDE_DIRS = {"evals"} - - -def should_exclude(rel_path: Path) -> bool: - """Check if a path should be excluded from packaging.""" - parts = rel_path.parts - if any(part in EXCLUDE_DIRS for part in parts): - return True - # rel_path is relative to skill_path.parent, so parts[0] is the skill - # folder name and parts[1] (if present) is the first subdir. - if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: - return True - name = rel_path.name - if name in EXCLUDE_FILES: - return True - return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) - - -def package_skill(skill_path, output_dir=None): - """ - Package a skill folder into a .skill file. - - Args: - skill_path: Path to the skill folder - output_dir: Optional output directory for the .skill file (defaults to current directory) - - Returns: - Path to the created .skill file, or None if error - """ - skill_path = Path(skill_path).resolve() - - # Validate skill folder exists - if not skill_path.exists(): - print(f"❌ Error: Skill folder not found: {skill_path}") - return None - - if not skill_path.is_dir(): - print(f"❌ Error: Path is not a directory: {skill_path}") - return None - - # Validate SKILL.md exists - skill_md = skill_path / "SKILL.md" - if not skill_md.exists(): - print(f"❌ Error: SKILL.md not found in {skill_path}") - return None - - # Run validation before packaging - print("🔍 Validating skill...") - valid, message = validate_skill(skill_path) - if not valid: - print(f"❌ Validation failed: {message}") - print(" Please fix the validation errors before packaging.") - return None - print(f"✅ {message}\n") - - # Determine output location - skill_name = skill_path.name - if output_dir: - output_path = Path(output_dir).resolve() - output_path.mkdir(parents=True, exist_ok=True) - else: - output_path = Path.cwd() - - 
skill_filename = output_path / f"{skill_name}.skill" - - # Create the .skill file (zip format) - try: - with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: - # Walk through the skill directory, excluding build artifacts - for file_path in skill_path.rglob('*'): - if not file_path.is_file(): - continue - arcname = file_path.relative_to(skill_path.parent) - if should_exclude(arcname): - print(f" Skipped: {arcname}") - continue - zipf.write(file_path, arcname) - print(f" Added: {arcname}") - - print(f"\n✅ Successfully packaged skill to: {skill_filename}") - return skill_filename - - except Exception as e: - print(f"❌ Error creating .skill file: {e}") - return None - - -def main(): - if len(sys.argv) < 2: - print("Usage: python utils/package_skill.py [output-directory]") - print("\nExample:") - print(" python utils/package_skill.py skills/public/my-skill") - print(" python utils/package_skill.py skills/public/my-skill ./dist") - sys.exit(1) - - skill_path = sys.argv[1] - output_dir = sys.argv[2] if len(sys.argv) > 2 else None - - print(f"📦 Packaging skill: {skill_path}") - if output_dir: - print(f" Output directory: {output_dir}") - print() - - result = package_skill(skill_path, output_dir) - - if result: - sys.exit(0) - else: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/quick_validate.py b/plugins/agentv-dev/skills/agentv-bench/scripts/quick_validate.py deleted file mode 100644 index ed8e1dddc..000000000 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/quick_validate.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -""" -Quick validation script for skills - minimal version -""" - -import sys -import os -import re -import yaml -from pathlib import Path - -def validate_skill(skill_path): - """Basic validation of a skill""" - skill_path = Path(skill_path) - - # Check SKILL.md exists - skill_md = skill_path / 'SKILL.md' - if not skill_md.exists(): - return False, 
"SKILL.md not found" - - # Read and validate frontmatter - content = skill_md.read_text() - if not content.startswith('---'): - return False, "No YAML frontmatter found" - - # Extract frontmatter - match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) - if not match: - return False, "Invalid frontmatter format" - - frontmatter_text = match.group(1) - - # Parse YAML frontmatter - try: - frontmatter = yaml.safe_load(frontmatter_text) - if not isinstance(frontmatter, dict): - return False, "Frontmatter must be a YAML dictionary" - except yaml.YAMLError as e: - return False, f"Invalid YAML in frontmatter: {e}" - - # Define allowed properties - ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} - - # Check for unexpected properties (excluding nested keys under metadata) - unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES - if unexpected_keys: - return False, ( - f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " - f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" - ) - - # Check required fields - if 'name' not in frontmatter: - return False, "Missing 'name' in frontmatter" - if 'description' not in frontmatter: - return False, "Missing 'description' in frontmatter" - - # Extract name for validation - name = frontmatter.get('name', '') - if not isinstance(name, str): - return False, f"Name must be a string, got {type(name).__name__}" - name = name.strip() - if name: - # Check naming convention (kebab-case: lowercase with hyphens) - if not re.match(r'^[a-z0-9-]+$', name): - return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" - if name.startswith('-') or name.endswith('-') or '--' in name: - return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" - # Check name length (max 64 characters per spec) - if len(name) > 64: - return False, f"Name is too long ({len(name)} characters). 
Maximum is 64 characters." - - # Extract and validate description - description = frontmatter.get('description', '') - if not isinstance(description, str): - return False, f"Description must be a string, got {type(description).__name__}" - description = description.strip() - if description: - # Check for angle brackets - if '<' in description or '>' in description: - return False, "Description cannot contain angle brackets (< or >)" - # Check description length (max 1024 characters per spec) - if len(description) > 1024: - return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." - - # Validate compatibility field if present (optional) - compatibility = frontmatter.get('compatibility', '') - if compatibility: - if not isinstance(compatibility, str): - return False, f"Compatibility must be a string, got {type(compatibility).__name__}" - if len(compatibility) > 500: - return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." - - return True, "Skill is valid!" - -if __name__ == "__main__": - if len(sys.argv) != 2: - print("Usage: python quick_validate.py ") - sys.exit(1) - - valid, message = validate_skill(sys.argv[1]) - print(message) - sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py deleted file mode 100644 index 45280494c..000000000 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_code_graders.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -""" -Run code-grader assertions on existing responses. - -Calls `agentv pipeline grade` to execute all code-grader assertions declared in -the eval against response.md files in the export directory. 
- -Usage: - python run_code_graders.py - -Example: - python run_code_graders.py .agentv/results/export/run-1 - -Prerequisites: - - `agentv pipeline input` has been run (or run_tests.py) - - response.md exists in each test directory - -Output: - //code_grader_results/.json -""" -import argparse -import subprocess -import sys - - -from agentv_cli import find_agentv as _find_agentv - - -def main(): - parser = argparse.ArgumentParser(description="Run code-grader assertions") - parser.add_argument("export_dir", help="Export directory from pipeline input") - args = parser.parse_args() - - result = subprocess.run( - [*_find_agentv(), "pipeline", "grade", args.export_dir], - capture_output=False, - ) - sys.exit(result.returncode) - - -if __name__ == "__main__": - main() diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py deleted file mode 100644 index aff81612e..000000000 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 -""" -Run eval test cases by extracting inputs and invoking CLI targets. - -Calls `agentv pipeline input` to extract inputs, then invokes each test's CLI -target command in parallel, writing response.md per test. - -Usage: - python run_tests.py --out [--workers N] - -Example: - python run_tests.py evals/repro.eval.yaml --out .agentv/results/export/run-1 - -Output structure: - / - ├── manifest.json ← from agentv pipeline input - ├── / - │ ├── input.json ← from agentv pipeline input - │ ├── invoke.json ← from agentv pipeline input - │ ├── response.md ← target output (written by this script) - │ └── timing.json ← execution timing (written by this script) - -For agent-as-target mode (invoke.json has kind=agent — all non-CLI providers -unless subagent_mode_allowed=false in targets.yaml), this script only runs -`agentv pipeline input`. Executor subagents handle execution directly. 
-""" -import argparse -import json -import os -import subprocess -import sys -import tempfile -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime, timezone -from pathlib import Path - - -from agentv_cli import find_agentv as _find_agentv - - -def _load_env(env_file: Path) -> dict: - """Read key=value pairs from a .env file, ignoring comments and blanks.""" - env = {} - for line in env_file.read_text().splitlines(): - line = line.strip() - if not line or line.startswith("#"): - continue - if "=" not in line: - continue - key, _, value = line.partition("=") - env[key.strip()] = value.strip() - return env - - -def run_agentv_input(eval_path: str, out_dir: str) -> dict: - """Call agentv pipeline input and return the manifest.""" - result = subprocess.run( - [*_find_agentv(), "pipeline", "input", eval_path, "--out", out_dir], - capture_output=True, - text=True, - ) - if result.returncode != 0: - print(f"agentv pipeline input failed:\n{result.stderr}", file=sys.stderr) - sys.exit(1) - manifest_path = Path(out_dir) / "manifest.json" - return json.loads(manifest_path.read_text()) - - -def invoke_cli_target(test_dir: Path, extra_env: dict | None = None) -> None: - """Read invoke.json and execute the CLI target command.""" - invoke_path = test_dir / "invoke.json" - invoke = json.loads(invoke_path.read_text()) - - if invoke.get("kind") != "cli": - return # Agent-as-target — skip CLI invocation - - input_data = json.loads((test_dir / "input.json").read_text()) - command_template = invoke["command"] - cwd = invoke.get("cwd") - timeout_s = invoke.get("timeout_ms", 30000) / 1000 - merged_env = {**os.environ, **(extra_env or {})} - - # Extract prompt text from input messages - prompt_text = next( - (m["content"] for m in input_data.get("input", []) if m.get("role") == "user"), - "", - ) - - # Write prompt to temp file for {PROMPT_FILE} placeholder - with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as 
pf: - pf.write(prompt_text) - prompt_file = pf.name - - # Create output file path for {OUTPUT_FILE} placeholder - output_file = tempfile.mktemp(suffix=".txt") - - # Render template - rendered = command_template - rendered = rendered.replace("{PROMPT}", prompt_text) - rendered = rendered.replace("{PROMPT_FILE}", prompt_file) - rendered = rendered.replace("{OUTPUT_FILE}", output_file) - - start = time.time() - try: - result = subprocess.run( - rendered, - shell=True, - cwd=cwd, - capture_output=True, - text=True, - timeout=timeout_s, - env=merged_env, - ) - duration_ms = int((time.time() - start) * 1000) - - if result.returncode != 0: - response = f"ERROR: target exited with code {result.returncode}\n{result.stderr}" - elif os.path.exists(output_file): - response = Path(output_file).read_text() - else: - response = result.stdout - except subprocess.TimeoutExpired: - duration_ms = int((time.time() - start) * 1000) - response = f"ERROR: target timed out after {timeout_s}s" - finally: - for f in [prompt_file, output_file]: - try: - os.unlink(f) - except OSError: - pass - - (test_dir / "response.md").write_text(response) - (test_dir / "timing.json").write_text( - json.dumps( - { - "duration_ms": duration_ms, - "total_duration_seconds": round(duration_ms / 1000, 3), - }, - indent=2, - ) - + "\n" - ) - - -def main(): - parser = argparse.ArgumentParser(description="Run eval test cases") - parser.add_argument("eval_path", help="Path to eval YAML file") - parser.add_argument("--out", required=True, help="Output directory") - parser.add_argument( - "--workers", type=int, default=3, help="Parallel workers (default: 3)" - ) - args = parser.parse_args() - - if "AGENTV_RUN_TIMESTAMP" not in os.environ: - ts = datetime.now(timezone.utc).isoformat().replace(":", "-").replace(".", "-") - os.environ["AGENTV_RUN_TIMESTAMP"] = ts - - # Load .env from eval directory or any parent - eval_dir = Path(args.eval_path).resolve().parent - env_file = None - for p in [eval_dir] + 
list(eval_dir.parents): - candidate = p / ".env" - if candidate.exists(): - env_file = candidate - break - extra_env = _load_env(env_file) if env_file else {} - - manifest = run_agentv_input(args.eval_path, args.out) - out = Path(args.out) - - test_ids = manifest["test_ids"] - cli_tests = [] - for tid in test_ids: - test_dir = out / tid - invoke = json.loads((test_dir / "invoke.json").read_text()) - if invoke.get("kind") == "cli": - cli_tests.append(test_dir) - - if not cli_tests: - print( - f"Extracted {len(test_ids)} test(s). No CLI targets to invoke (agent-as-target mode)." - ) - return - - print(f"Running {len(cli_tests)} CLI target(s) with {args.workers} workers...") - with ThreadPoolExecutor(max_workers=args.workers) as pool: - futures = {pool.submit(invoke_cli_target, td, extra_env): td.name for td in cli_tests} - for future in as_completed(futures): - tid = futures[future] - try: - future.result() - print(f" {tid}: done") - except Exception as e: - print(f" {tid}: ERROR — {e}", file=sys.stderr) - - print(f"Done. Responses written to {args.out}") - - -if __name__ == "__main__": - main()