From 4dd084e79aa3aad6f830f9e2e92ad227d84b71fd Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 10:49:29 +1100 Subject: [PATCH 01/14] feat: add workspace skills and fix eval for pi-cli execution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add agentic-engineering plugin + agentv-eval-review to workspace template - Add .allagents/workspace.yaml for pi-cli skill discovery - Fix skill-trigger field (value → skill) - Remove skill-trigger assertions (pi-cli plugin discovery not working yet) - Add workers: 1 to prevent concurrent workspace corruption - Baseline: 5/9 tests pass without skill loaded Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agent-plugin-review.eval.yaml | 21 +- .../.allagents/workspace.yaml | 5 + .../skills/agent-architecture-design/SKILL.md | 108 ++++++++ .../references/agentic-design-patterns.md | 152 +++++++++++ .../references/workflow-patterns.md | 105 ++++++++ .../skills/agent-plugin-review/SKILL.md | 102 ++++++++ .../references/skill-quality-checklist.md | 125 +++++++++ .../references/workflow-checklist.md | 78 ++++++ .../scripts/lint_plugin.py | 198 +++++++++++++++ .../skills/agentv-eval-review/SKILL.md | 52 ++++ .../agentv-eval-review/scripts/lint_eval.py | 239 ++++++++++++++++++ 11 files changed, 1166 insertions(+), 19 deletions(-) create mode 100644 evals/agentic-engineering/workspace-template/.allagents/workspace.yaml create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md create mode 100644 
evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/SKILL.md create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/workflow-checklist.md create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/scripts/lint_plugin.py create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/SKILL.md create mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/scripts/lint_eval.py diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 35df6996..9fcd30d4 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -1,8 +1,9 @@ -description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin +description: Evaluates that an agent can catch planted issues in a mock plugin execution: targets: - pi-cli + workers: 1 workspace: template: ./workspace-template @@ -14,8 +15,6 @@ tests: Review the deploy-auto plugin in this repo for completeness. Check that every skill has a corresponding eval file. assertions: - - type: skill-trigger - value: agent-plugin-review - type: contains value: deploy-rollback - type: rubrics @@ -28,8 +27,6 @@ tests: input: | Review the eval files under evals/deploy-auto/ for naming convention issues. assertions: - - type: skill-trigger - value: agent-plugin-review - type: contains value: .eval.yaml - type: rubrics @@ -44,8 +41,6 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for eval quality issues. 
Check assertion coverage and expected_output format. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that no assertions are defined in deploy-plan.yaml @@ -57,8 +52,6 @@ tests: input: | Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that file paths are missing a leading slash @@ -70,8 +63,6 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for structural improvements. Look at how inputs are organized across test cases. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Identifies the repeated SKILL.md file input across all 3 tests @@ -83,8 +74,6 @@ tests: Review the deploy-auto plugin's workflow architecture. Check whether phases enforce prerequisites before proceeding. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that deploy-execute does not check for deploy-plan.md before starting @@ -97,8 +86,6 @@ tests: Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy. Cross-check expected outputs against what the skills actually document. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags the contradiction between pytest (skill) and python -m unittest (eval) @@ -110,8 +97,6 @@ tests: Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues. Check that referenced commands and skills actually exist. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that /deploy-execute is referenced but does not exist as a slash command @@ -123,8 +108,6 @@ tests: input: | Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues. 
assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags the hardcoded path C:\Users\admin\.kube\config diff --git a/evals/agentic-engineering/workspace-template/.allagents/workspace.yaml b/evals/agentic-engineering/workspace-template/.allagents/workspace.yaml new file mode 100644 index 00000000..40d57f65 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/.allagents/workspace.yaml @@ -0,0 +1,5 @@ +plugins: + - source: ./plugins/agentic-engineering + +clients: + - pi diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md new file mode 100644 index 00000000..283620b4 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md @@ -0,0 +1,108 @@ +--- +name: agent-architecture-design +description: >- + Use when designing an AI agent system, selecting agentic design patterns, + planning multi-phase workflows, choosing between single-agent and multi-agent architectures, + or when asked "what kind of agent should I build", "how should I structure this automation", + "design an agent for X", or "which agentic pattern fits this problem". +--- + +# Agent Architecture Design + +## Overview + +Guide the selection and design of the correct agentic architecture by diagnosing the problem type, mapping it to a proven design pattern, and defining the workflow structure, tooling, and management model. 
+ +## Process + +### Phase 1: Problem Diagnosis + +Categorize the request on two axes: + +| | Task-Level (single job) | Project-Level (coordination needed) | +|---|---|---| +| **Software-Shaped** (working code/system) | Single-Agent Iterative Loop | Autonomous Pipeline or Multi-Agent System | +| **Metric-Shaped** (optimize a number) | Optimization Loop | Optimization Loop + Multi-Agent System | + +**Diagnosis questions:** +1. Is the goal working software or optimizing a metric? +2. Is this a single discrete task or multiple coordinated parts? +3. How much human involvement is acceptable during execution? +4. What scale justifies the architecture complexity? + +### Phase 2: Pattern Selection + +Load `references/agentic-design-patterns.md` for full details on each pattern. Summary: + +**Single-Agent Iterative Loop** (Agentic IDE) +- Human = manager, Agent = worker +- Decompose the problem into small chunks (UI, API, tests) +- Agent gets a workspace (terminal, files, search) +- Best for: individual developer productivity on discrete tasks + +**Autonomous Pipeline** (Zero-Human Loop) +- Spec In → Autonomous Zone → Eval Out +- Heavy human involvement at start (specs) and end (review), zero in the middle +- Requires robust evals — iterations happen automatically until eval passes +- Best for: zero-human-intervention software delivery + +**Optimization Loop** (Self-Improving Agent) +- Hill climbing against a specific metric +- Agent tries paths, fails, backtracks +- Needs a clear optimization target +- Best for: reaching peak of an optimization metric through experimentation + +**Multi-Agent System** (Hierarchical/Supervisor Pattern) +- Specialized roles with defined handoffs (Researcher → Writer → Editor → Publisher) +- Complexity lies in context management between agents +- Only justified at scale (10,000 tickets, not 10) +- Best for: seamless coordination across specialized AI workers + +### Phase 3: Workflow Architecture + +After selecting a pattern, define the 
workflow structure. Load `references/workflow-patterns.md` for framework-specific patterns. + +**For each pattern, define:** + +1. **Phases** — What sequential or parallel steps does the workflow execute? +2. **Artifacts** — What does each phase produce? (specs, designs, tasks, code, reports) +3. **Gates** — What must be true before proceeding to the next phase? +4. **Tooling** — What tools/MCPs does each agent need? +5. **Context flow** — How is information passed between phases/agents? +6. **Resumption** — How does the workflow recover from interruption? + +**Pattern → Workflow mapping:** + +| Agentic Design Pattern | Typical Workflow | +|---|---| +| Single-Agent Iterative Loop | Single-phase: decompose → implement → verify | +| Autonomous Pipeline | OpenSpec-style: validate → propose → design → implement → verify | +| Optimization Loop | Iteration loop: hypothesize → test → measure → backtrack/advance | +| Multi-Agent System | Role pipeline: role₁ → handoff → role₂ → handoff → roleₙ | + +### Phase 4: Output + +Produce a design document covering: + +1. **Diagnosis** — Software or metric shaped, task or project level +2. **Recommended Pattern** — Which agentic architecture and why +3. **Workflow Design** — Phases, artifacts, gates, context flow +4. **Scaffolding Plan** — Tools, MCPs, evals the agent needs +5. **Management Model** — Human role (Manager, Observer, or Spec-Writer) + +## Implementation Rules + +1. **Simple scales better** — Do not recommend 3-level management if 2-level works. Simple configurations are more performant. +2. **Context is everything** — Agents depend entirely on the context and scaffolding provided by the architect. Design the scaffolding, not just the agent. +3. **Human-centered → Agent-centered** — For large projects, move from "human managing every agent" to "planner agent managing sub-agents" where the human observes. +4. **Avoid pattern-confusion** — Never use an Optimization Loop to build a novel. 
Never use a Single-Agent Loop for a project requiring specialized multi-agent orchestration. +5. **Scale justifies complexity** — Multi-agent orchestration is only worth it at scale. For small problems, a single well-prompted agent outperforms a complex framework. + +## Skill Resources + +- `references/agentic-design-patterns.md` — Detailed pattern descriptions with examples and anti-patterns +- `references/workflow-patterns.md` — Workflow patterns from OpenSpec, Superpowers, and Compound Engineering + +## Related Skills + +- **agent-plugin-review** — Review an implemented plugin against architecture best practices diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md new file mode 100644 index 00000000..0cebab50 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md @@ -0,0 +1,152 @@ +# Agentic Design Patterns + +Four foundational architectures for AI agent systems. Each pattern defines a management model, workflow structure, and set of anti-patterns. + +## Single-Agent Iterative Loop (Agentic IDE) + +**Use when:** Problem is software-shaped and scale is task-level. + +**Architecture:** +- Human is the manager; agent is the worker +- Focus on decomposition — break the big problem into small, well-defined chunks +- Each chunk is independently implementable and testable + +**Tooling requirements:** +- Terminal access (shell, build tools, test runners) +- File system access (read, write, search) +- Search tools (grep, glob, web search) +- Version control (git) + +**Workflow:** Single-phase — decompose → implement → verify + +**Management model:** Human as manager. 
Human defines what to build, agent builds it, human reviews. + +**Example:** A developer using Claude Code to implement a feature. They describe what they want, the agent writes the code, developer reviews and iterates. + +**Anti-patterns:** +- Using a single-agent loop for a project that needs 10+ coordinated agents +- No decomposition — giving the agent one massive task instead of focused chunks +- No verification step — trusting agent output without review + +--- + +## Autonomous Pipeline (Zero-Human Loop) + +**Use when:** Problem is software-shaped and high autonomy is required. + +**Architecture:** +- Spec In → Autonomous Zone → Eval Out +- Human involvement is heavy at start (specs) and end (review), zero in the middle +- Iterations (v0.1 → v1.0) happen automatically until eval passes + +**Requirements:** +- Robust evals are mandatory — the system cannot self-correct without them +- Specs must be precise enough to generate working systems +- Evals must be discriminating — pass for good output, fail for bad + +**Workflow:** OpenSpec-style pipeline: +1. Validate (check requirements against reality) +2. Propose (define WHAT and WHY) +3. Design (plan HOW) +4. Implement (TDD through task checklist) +5. Verify (build + test + spec traceability) + +**Management model:** Human as spec-writer. Human writes specs and reviews final output. Everything in between is autonomous. + +**Example:** A spec-driven development plugin where the developer provides a work item number, and the system autonomously validates requirements, designs the implementation, codes it with TDD, and produces a PR. + +**Anti-patterns:** +- No evals — the system has no way to know when it's done or if it's correct +- Specs too vague — "make it better" is not a spec +- Human intervening in the autonomous zone — defeats the purpose + +--- + +## Optimization Loop (Self-Improving Agent) + +**Use when:** Problem is metric-shaped (optimization). 
+ +**Architecture:** +- Hill climbing against a specific metric +- Agent tries paths, fails, and backtracks +- Each iteration measures progress against the target + +**Requirements:** +- Clear, measurable optimization target +- Fast feedback loop (metric must be computable quickly) +- Permission to explore and fail + +**Workflow:** Iteration loop: +1. Hypothesize (propose a change) +2. Test (apply the change) +3. Measure (evaluate against metric) +4. Decide (advance if improved, backtrack if not) +5. Repeat until target reached or budget exhausted + +**Management model:** Human as observer. Human defines the metric and constraints, agent explores the solution space. + +**Example:** Optimizing a prompt's accuracy against an eval suite. Agent tries variations, measures pass rate, keeps improvements, discards regressions. + +**Anti-patterns:** +- No clear metric — "make it better" is not optimizable +- Using for creative tasks — novels, designs, art have no single metric +- No backtracking — agent must be allowed to undo bad changes + +--- + +## Multi-Agent System (Hierarchical/Supervisor Pattern) + +**Use when:** Problem requires specialized roles and complex handoffs. + +**Architecture:** +- Define specialized roles (Researcher → Writer → Editor → Publisher) +- Focus on handoffs — complexity lies in context management between agents +- Each role has its own tools, context, and success criteria + +**Scale requirement:** Only justified when the volume warrants it. Managing 10,000 tickets needs orchestration. Managing 10 does not. + +**Workflow:** Role pipeline with handoffs: +1. Role₁ performs its task, produces output artifact +2. Handoff: artifact + summary passed to Role₂ +3. Role₂ performs its task, produces next artifact +4. Continue until pipeline complete + +**Management model:** Human as observer or planner-manager. For large scale, a planner agent manages sub-agents while human observes. 
+ +**Context management:** +- Each handoff loses context — design artifacts to carry essential information +- Summaries at each handoff prevent context window overflow +- Shared state (files, databases) can bridge context gaps + +**Example:** A content pipeline where a researcher gathers information, a writer produces a draft, an editor refines it, and a publisher formats and distributes it. + +**Anti-patterns:** +- Over-engineering — using orchestration for a 3-step task one person could do +- Poor handoffs — losing critical context between agents +- No specialization — all agents doing the same thing (just use a single-agent loop) +- Too many management layers — 3-level hierarchies are almost always slower than 2-level + +--- + +## Pattern Selection Decision Tree + +``` +Is the goal working software or optimizing a metric? +├── Software-shaped +│ ├── Single discrete task? → Single-Agent Iterative Loop +│ ├── Needs full autonomy (spec → code → eval)? → Autonomous Pipeline +│ └── Multiple specialized roles needed at scale? → Multi-Agent System +└── Metric-shaped + ├── Single metric to optimize? → Optimization Loop + └── Multiple metrics across coordinated roles? → Optimization Loop + Multi-Agent System +``` + +## Hybrid Architectures + +Real systems often combine patterns: + +- **Autonomous Pipeline + Optimization Loop:** Auto-iterate on prompts using eval scores +- **Single-Agent Loop + Multi-Agent System:** Individual coding agents orchestrated by a planner for large projects +- **Autonomous Pipeline + Multi-Agent System:** Autonomous pipeline with specialized roles (validate-agent, design-agent, code-agent) + +When combining, keep the management model simple. A 2-level structure (planner + workers) outperforms deeper hierarchies. 
diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md new file mode 100644 index 00000000..1465ce11 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md @@ -0,0 +1,105 @@ +# Workflow Patterns by Framework + +Patterns from reference frameworks for designing agent workflows, organized by agentic design pattern. + +## OpenSpec (OPSX Conventions) + +**Source:** [OpenSpec](https://github.com/Fission-AI/OpenSpec) + +**Best for:** Autonomous Pipeline and Multi-Agent System + +**Core concept:** Artifact-driven dependency graph. Commands chain through file existence, not sequential phases. + +**Default workflow (spec-driven):** +``` +/opsx:explore → /opsx:propose → /opsx:apply → /opsx:archive +``` + +**Expanded workflow:** +``` +/opsx:new → /opsx:continue (×N) → /opsx:apply → /opsx:verify → /opsx:archive +``` + +**Key patterns:** +- **Artifact gates** — Each phase produces a file. Next phase checks file exists before starting. +- **Delta specs** — Changes are expressed as ADDED/MODIFIED/REMOVED operations on existing specs, not full rewrites. +- **Fast-forward** (`/opsx:ff`) — Generate all planning artifacts at once for clear-scope work. +- **Schema-configurable** — Workflow phases defined in `schema.yaml` as a DAG, not hardcoded. +- **Archive merges deltas** — Completed changes are merged back into main specs, keeping specs as source of truth. 
+ +**Artifact types:** +| Artifact | Purpose | +|---|---| +| `proposal.md` | WHAT and WHY (scope, non-goals, acceptance criteria) | +| `specs/*.md` | Behavior contracts with Given/When/Then scenarios | +| `design.md` | HOW (technical approach, decisions, risks) | +| `tasks.md` | Implementation checklist with checkboxes | +| `verify-report.md` | Verification results and traceability | + +--- + +## Superpowers + +**Source:** [Superpowers](https://github.com/obra/superpowers/) + +**Best for:** Single-Agent Iterative Loop and Autonomous Pipeline + +**Core concept:** Skills as workflow phases with hard gates and mandatory skill checks. + +**Workflow phases:** +1. Brainstorming — Explore requirements before committing +2. Writing Plans — Task decomposition +3. Executing Plans / Subagent-Driven Development — Implementation +4. Test-Driven Development — RED-GREEN-REFACTOR during implementation +5. Requesting Code Review — Verification +6. Finishing a Development Branch — Completion + +**Key patterns:** +- **`HARD-GATE`** — Synchronization points that prevent progression without explicit checks. Agent must verify conditions before proceeding. +- **The 1% Rule** — If there's even a 1% chance a skill applies, invoke it. Prevents agents from rationalizing past important steps. +- **`SUBAGENT-STOP`** — Prevents subagents from invoking full skill workflows when executing specific tasks. +- **Brainstorming before planning** — Always explore intent and requirements before committing to a plan. +- **Two-stage code review** — Spec compliance review then code quality review (not one combined review). + +--- + +## Compound Engineering + +**Source:** [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) + +**Best for:** Autonomous Pipeline with learning loop + +**Core concept:** Four-phase repeating cycle where learnings compound across iterations. 
+ +**Workflow:** +``` +/ce:plan → /ce:work → /ce:review → /ce:compound → repeat +``` + +**Key patterns:** +- **Compounding loop** (`/ce:compound`) — After each cycle, document what worked and what didn't. Feed learnings into future planning. Each cycle gets easier. +- **Autonomous modes:** + - `/lfg` (Let's Go) — Sequential full cycle + - `/slfg` (Swarm LFG) — Parallel execution during review/testing +- **Multi-agent review** — Review phase dispatches multiple agents for parallel code review. +- **Knowledge accumulation** — Solutions documented in the compound phase become reusable patterns. + +--- + +## Framework Selection by Design Pattern + +| Agentic Design Pattern | Primary Framework | Secondary Framework | +|---|---|---| +| Single-Agent Iterative Loop | Superpowers (brainstorm → plan → TDD) | — | +| Autonomous Pipeline | OpenSpec (validate → propose → design → apply → verify) | Compound Engineering (learning loop) | +| Optimization Loop | Custom iteration loop (hypothesize → test → measure → decide) | — | +| Multi-Agent System | OpenSpec artifact gates + Superpowers hard gates | Compound Engineering (per-role learning) | + +## Universal Patterns (All Architectures) + +1. **Hard gates** — Check prerequisites before proceeding. Never silently skip. +2. **Artifact persistence** — Write phase outputs to disk, not just conversation context. Enables cross-session resumption. +3. **Workflow state metadata** — Track which phases are complete in a YAML file alongside artifacts. +4. **Error handling** — Standardize retry policy. Clear failure messages naming what to fix. +5. **Trivial escape hatch** — Document when it's OK to skip phases (small fixes, config changes). +6. **Artifact self-correction** — Downstream phases can fix factual errors in upstream artifacts, logged in a corrections section. 
diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/SKILL.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/SKILL.md new file mode 100644 index 00000000..3be08356 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/SKILL.md @@ -0,0 +1,102 @@ +--- +name: agent-plugin-review +description: >- + Use when reviewing an AI plugin pull request, auditing plugin quality before release, + or when asked to "review a plugin PR", "review skills in this PR", "check plugin quality", + or "review workflow architecture". Covers skill quality, structural linting, and workflow + architecture review. +--- + +# Plugin Review + +## Overview + +Review AI plugin PRs by running deterministic structural checks first, then applying LLM judgment for skill quality and workflow architecture. Post findings as inline PR comments. + +## Process + +### Step 1: Structural lint + +Run `scripts/lint_plugin.py` against the plugin directory: + +```bash +python scripts/lint_plugin.py --evals-dir --json +``` + +The script checks: +- Every `skills/*/SKILL.md` has a corresponding eval file +- SKILL.md frontmatter has `name` and `description` +- No hardcoded local paths (drive letters, absolute OS paths) +- No version printing instructions +- Referenced files (`references/*.md`) exist +- Commands reference existing skills +- Path style consistency across commands + +Report findings grouped by severity (error > warning > info). + +### Step 2: Eval lint + +If the PR includes eval files, invoke `agentv-eval-review` for AgentV-specific eval quality checks. + +### Step 3: Skill quality review (LLM judgment) + +For each SKILL.md, check against `references/skill-quality-checklist.md`: + +- Description starts with "Use when..." 
and describes triggering conditions only (not workflow) +- Description does NOT summarize the skill's process — this causes agents to follow the description instead of reading the SKILL.md body +- Body is concise — only include what the agent doesn't already know +- Imperative/infinitive form, not second person +- Heavy reference (100+ lines) moved to `references/` files +- One excellent code example beats many mediocre ones +- Flowcharts only for non-obvious decisions +- Keywords throughout for search discovery +- Cross-references use skill name with requirement markers, not `@` force-load syntax +- Discipline-enforcing skills have rationalization tables, red flags lists, and explicit loophole closures +- Consistency — no contradictions within or across files (tool names, filenames, commands, rules) +- No manual routing workarounds — if AGENTS.md or instruction files contain heavy TRIGGER/ACTION routing tables or skill-chain logic, the skill descriptions are likely too weak. Good descriptions enable auto-discovery without manual routing. + +### Step 4: Workflow architecture review (LLM judgment) + +For plugins with multi-phase workflows, check against `references/workflow-checklist.md`: + +- Hard gates between phases (artifact existence checks) +- Artifact persistence convention (defined output directory) +- Workflow state metadata for cross-session resumption +- Resumption protocol (detect existing artifacts, skip completed phases) +- Standardized error handling with retry +- Trivial change escape hatch +- Artifact self-correction with corrections log +- Learning loop mechanism + +### Step 5: Post review + +Post findings as inline PR comments at specific line numbers. 
Group by severity: +- **Critical** — Broken references, missing evals, factual contradictions, missing hard gates +- **Medium** — Naming inconsistencies, hardcoded paths, missing assertions, ad-hoc error handling +- **Low** — Style inconsistencies, description improvements + +Use a PR review (not individual comments) to batch all findings. + +## Skill Resources + +- `scripts/lint_plugin.py` — Deterministic plugin linter (Python 3.11+, stdlib only) +- `references/skill-quality-checklist.md` — Skill quality checklist (CSO, descriptions, content, discipline skills) +- `references/workflow-checklist.md` — Workflow architecture checklist (OpenSpec, hard gates, artifacts) + +## External References + +For deeper research on challenging reviews, consult these resources via web fetch, deepwiki, or clone the repo locally: + +- [Agent Skills specification](https://agentskills.io/specification) — Official SKILL.md format, frontmatter fields, progressive disclosure rules +- [Agent Skills best practices](https://agentskills.io/skill-creation/best-practices) — Context spending, calibrating control, gotchas, scripts, validation loops +- [Agent Skills description optimization](https://agentskills.io/skill-creation/optimizing-descriptions) — Trigger testing, train/validation splits, overfitting avoidance +- [Agent Skills using scripts](https://agentskills.io/skill-creation/using-scripts) — Self-contained scripts, --help, structured output, idempotency, exit codes +- [AgentV documentation](https://agentv.dev/) — Eval YAML schema, assertion types, workspace evals, multi-provider targets +- [OpenSpec](https://github.com/Fission-AI/OpenSpec) — Spec-driven development framework (OPSX conventions, artifact graphs, hard gates, delta specs) +- [Superpowers](https://github.com/obra/superpowers/) — Claude Code plugin with `HARD-GATE` pattern, brainstorming workflow, skill-based development phases +- [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) — Four-phase workflow 
(Plan/Work/Review/Compound) with learning loop pattern + +## Related Skills + +- **agentv-eval-review** — Lint and review AgentV eval files (invoke for eval-specific checks) +- **agent-architecture-design** — Design agent architectures from scratch diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md new file mode 100644 index 00000000..1a8f279c --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md @@ -0,0 +1,125 @@ +# Skill Quality Checklist + +Derived from [Superpowers writing-skills](https://github.com/obra/superpowers/) and [Anthropic's skill authoring best practices](https://docs.anthropic.com/en/docs/agents-and-tools/agent-skills). + +## Frontmatter + +- [ ] Only two fields: `name` and `description` (no other fields supported) +- [ ] Max 1024 characters total in frontmatter +- [ ] `name` uses only letters, numbers, and hyphens (no parentheses, special chars) +- [ ] `description` written in third person +- [ ] `description` starts with "Use when..." focusing on triggering conditions +- [ ] `description` describes WHEN to use, NOT WHAT the skill does +- [ ] `description` does NOT summarize the skill's workflow or process + +### Why description must not summarize workflow + +Testing revealed that when a description summarizes the skill's workflow, Claude may follow the description instead of reading the full SKILL.md content. A description saying "code review between tasks" caused Claude to do ONE review, even though the SKILL.md flowchart clearly showed TWO reviews. When the description was changed to just triggering conditions, Claude correctly read and followed the full skill. 
+ +### Description examples + +```yaml +# BAD: Summarizes workflow - Claude may follow this instead of reading skill +description: Use when executing plans - dispatches subagent per task with code review between tasks + +# BAD: Too much process detail +description: Use for TDD - write test first, watch it fail, write minimal code, refactor + +# BAD: Too abstract, vague +description: For async testing + +# BAD: First person +description: I can help you with async tests when they're flaky + +# GOOD: Just triggering conditions, no workflow summary +description: Use when executing implementation plans with independent tasks in the current session + +# GOOD: Triggering conditions only +description: Use when implementing any feature or bugfix, before writing implementation code + +# GOOD: Problem-focused, technology-agnostic +description: Use when tests have race conditions, timing dependencies, or pass/fail inconsistently +``` + +## Content Quality + +### Conciseness (Claude Search Optimization) + +- [ ] SKILL.md body is concise — only include what Claude doesn't already know +- [ ] Challenge each paragraph: "Does Claude really need this explanation?" 
+- [ ] Target word counts: + - Frequently-loaded skills: < 200 words + - Standard skills: < 500 words + - With references: SKILL.md lean, details in reference files +- [ ] Move heavy reference (100+ lines) to separate files +- [ ] Use cross-references instead of repeating content from other skills +- [ ] Compress examples — one excellent example beats many mediocre ones + +### Structure + +- [ ] Overview: core principle in 1-2 sentences +- [ ] When to Use: symptoms and use cases (flowchart only if decision is non-obvious) +- [ ] When NOT to use: explicit exclusions +- [ ] Core Pattern: before/after comparison (for techniques/patterns) +- [ ] Quick Reference: table or bullets for scanning +- [ ] Common Mistakes: what goes wrong + fixes +- [ ] Inline code for simple patterns, separate file for heavy reference + +### Writing Style + +- [ ] Imperative/infinitive form (verb-first instructions) +- [ ] NOT second person ("you should...") +- [ ] Technology-agnostic triggers unless skill is technology-specific +- [ ] Keywords throughout for search discovery (error messages, symptoms, synonyms, tool names) + +### Degrees of Freedom + +Match specificity to the task's fragility: + +| Freedom Level | When to Use | Example | +|---|---|---| +| High (text instructions) | Multiple valid approaches, context-dependent | Code review process | +| Medium (pseudocode/templates) | Preferred pattern exists, some variation OK | Report generation | +| Low (exact scripts) | Precise steps required, fragile operations | Database migration | + +## File Organization + +- [ ] Flat namespace — all skills in one searchable directory +- [ ] Supporting files only for: heavy reference (100+ lines), reusable tools/scripts +- [ ] Everything else inline in SKILL.md +- [ ] No narrative storytelling ("In session 2025-10-03, we found...") +- [ ] No multi-language dilution (one excellent example, not 5 mediocre ones) + +## Flowchart Usage + +- [ ] Use ONLY for non-obvious decision points, process loops, "A vs 
B" decisions +- [ ] Never use for: reference material (→ tables), code (→ code blocks), linear instructions (→ numbered lists) +- [ ] Labels must have semantic meaning (not "step1", "helper2") + +## Cross-References + +- [ ] Use skill name with explicit requirement markers: `**REQUIRED:** Use skill-name` +- [ ] Do NOT use `@` syntax to force-load files (burns context) +- [ ] Do NOT repeat content available in referenced skills + +## Anti-Patterns to Flag + +| Anti-Pattern | Why It's Bad | +|---|---| +| Narrative examples ("In session X, we found...") | Too specific, not reusable | +| Multi-language examples (JS, Python, Go, etc.) | Mediocre quality, maintenance burden | +| Code in flowcharts | Can't copy-paste, hard to read | +| Generic labels (helper1, step2) | No semantic meaning | +| Version printing instructions | Fragile, rely on git history | +| Hardcoded local paths | Machine-specific, not portable | +| Description summarizes workflow | Claude follows description, skips SKILL.md body | + +## Discipline-Enforcing Skills (Additional Checks) + +For skills that enforce rules (TDD, verification, coding standards): + +- [ ] Specific workarounds explicitly forbidden (not just "don't do X" but "don't keep it as reference, don't adapt it, delete means delete") +- [ ] Rationalization table present (common excuses + reality) +- [ ] Red flags list for self-checking +- [ ] "Spirit vs letter" addressed: "Violating the letter IS violating the spirit" +- [ ] Hard gates at critical decision points diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/workflow-checklist.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/workflow-checklist.md new file mode 100644 index 00000000..c5f3fa1f --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/workflow-checklist.md @@ 
-0,0 +1,78 @@ +# Workflow Architecture Checklist + +Review multi-phase plugin workflows against these patterns, derived from [OpenSpec](https://github.com/Fission-AI/OpenSpec) (OPSX conventions), [Superpowers](https://github.com/obra/superpowers/), and [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin). + +## Phase Coverage + +Compare the plugin's workflow phases against the OpenSpec artifact model: + +| OpenSpec Phase | OPSX Command | Expected Plugin Equivalent | +|---|---|---| +| Explore | `/opsx:explore` | Research mode — investigate without creating artifacts | +| Validate | (custom) | Check requirements against real codebase before design | +| Propose | `/opsx:propose` | Define WHAT and WHY with acceptance criteria | +| Design | (via schema) | Plan HOW — file-level changes, multi-repo coordination | +| Tasks | (via schema) | Standalone `tasks.md` with `- [ ]` checkboxes | +| Apply | `/opsx:apply` | Implement through task checklist with TDD | +| Verify | `/opsx:verify` | Build + test + trace implementation back to specs | +| Archive | `/opsx:archive` | Finalize, merge deltas, persist learnings | + +Not all phases are required for every plugin. Flag missing phases only when the gap would cause real problems. + +## Hard Gates + +From [Superpowers](https://github.com/obra/superpowers/) `` pattern: + +- [ ] Each phase checks for prerequisite artifacts before proceeding +- [ ] Gate failure message tells the user which command/skill to run first +- [ ] Gates cannot be silently bypassed +- [ ] Gate checks happen at the start of the skill, before any work + +Example gate: +``` +HARD GATE: `hld-review.md` MUST exist in {output_dir}/. +If missing, inform the user: "Run the design-review skill first." STOP. 
+``` + +## Artifact Contracts + +- [ ] Each phase produces a defined output artifact (e.g., `context.md`, `design.md`, `tasks.md`) +- [ ] Output format of phase N matches expected input of phase N+1 +- [ ] Artifact location convention is defined (not just `{output_dir}/`) +- [ ] Artifacts persist to disk (not just conversation context) for cross-session resumption + +## Workflow State + +- [ ] Workflow state tracked in a metadata file (e.g., `.workflow.yaml`) alongside artifacts +- [ ] Metadata records: which phases are complete, timestamps, WI/issue number +- [ ] Resumption protocol detects existing artifacts and skips completed phases +- [ ] Partial completion is handled (e.g., Phase 4 with N-1 of N agents succeeding) + +## Error Handling + +- [ ] Standardized retry policy across all skills (e.g., retry MCP calls 3x with exponential backoff) +- [ ] Clear failure reporting — user knows what failed and what to do next +- [ ] Errors don't silently corrupt downstream phases +- [ ] Critical failures (P0 findings, merge conflicts) stop the workflow + +## Escape Hatches + +- [ ] Trivial change escape: small fixes can skip spec phases +- [ ] Criteria for "trivial" are documented (e.g., < 20 lines, single file, no schema change) +- [ ] Artifact self-correction: downstream phases can fix factual errors in upstream artifacts +- [ ] Corrections are logged (e.g., `## Corrections Log` section) for auditability + +## Learning Loop + +From [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) `/ce:compound` pattern: + +- [ ] Mechanism exists to capture patterns from completed work +- [ ] Learnings feed back into future workflow runs (e.g., review guidelines, common patterns) +- [ ] Learning artifacts are version-controlled and mergeable + +## Fast-Forward Mode + +From OpenSpec `/opsx:ff`: + +- [ ] For well-understood changes, all planning artifacts can be generated in one pass +- [ ] Fast-forward mode is optional — users can still step through phases 
individually diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/scripts/lint_plugin.py b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/scripts/lint_plugin.py new file mode 100644 index 00000000..15f0be18 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/scripts/lint_plugin.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +"""Lint AI plugin structure for common issues. + +Usage: python lint_plugin.py [--evals-dir ] [--json] + +Checks: + - Every skills/*/SKILL.md has a corresponding eval file + - SKILL.md frontmatter has name and description + - No hardcoded local paths (drive letters, absolute OS paths) + - No version printing instructions + - Commands reference existing skills + - Path style consistency across commands + - Referenced files (references/*.md) exist + +Exit code: 0 if no issues, 1 if issues found. 
+""" + +import json +import os +import re +import sys +from pathlib import Path + + +def find_skills(plugin_dir: Path) -> list[Path]: + """Find all SKILL.md files in the plugin.""" + return sorted(plugin_dir.rglob("skills/*/SKILL.md")) + + +def find_evals(evals_dir: Path, plugin_name: str) -> list[Path]: + """Find eval files for a plugin.""" + plugin_evals = evals_dir / plugin_name + if not plugin_evals.exists(): + return [] + return sorted(plugin_evals.rglob("*.yaml")) + sorted(plugin_evals.rglob("*.yml")) + + +def find_commands(plugin_dir: Path) -> list[Path]: + """Find command files.""" + commands_dir = plugin_dir / "commands" + if not commands_dir.exists(): + return [] + return sorted(commands_dir.glob("*.md")) + + +def lint_plugin(plugin_dir: Path, evals_dir: Path | None = None) -> list[dict]: + issues = [] + + def issue(severity: str, msg: str, file: str | None = None, line: int | None = None): + issues.append({ + "file": file or str(plugin_dir), + "severity": severity, + "message": msg, + "line": line, + }) + + plugin_name = plugin_dir.name + skills = find_skills(plugin_dir) + commands = find_commands(plugin_dir) + + # Collect skill names + skill_names = set() + for skill_path in skills: + skill_name = skill_path.parent.name + skill_names.add(skill_name) + + # Check each SKILL.md + for skill_path in skills: + skill_name = skill_path.parent.name + text = skill_path.read_text(encoding="utf-8") + lines = text.splitlines() + + # Check frontmatter + if not text.startswith("---"): + issue("error", "Missing YAML frontmatter", str(skill_path)) + else: + fm_end = text.find("---", 3) + if fm_end == -1: + issue("error", "Unclosed YAML frontmatter", str(skill_path)) + else: + fm = text[3:fm_end] + if "name:" not in fm: + issue("error", "Frontmatter missing 'name' field", str(skill_path)) + if "description:" not in fm: + issue("error", "Frontmatter missing 'description' field", str(skill_path)) + + # Check for hardcoded paths + drive_letter_pat = 
re.compile(r'[A-Z]:\\[A-Za-z]') + for i, line in enumerate(lines, 1): + if drive_letter_pat.search(line): + # Skip if it's in a table header or obviously an example + if "Override" not in line and "Example" not in line: + issue("warning", f"Hardcoded local path detected", str(skill_path), i) + + # Check for version printing + version_pat = re.compile(r'print.*version|version \d{8}', re.IGNORECASE) + for i, line in enumerate(lines, 1): + if version_pat.search(line): + issue("warning", "Version printing instruction — rely on git history", str(skill_path), i) + + # Check referenced files exist + ref_pat = re.compile(r'`(references/[^`]+)`') + skill_dir = skill_path.parent + for i, line in enumerate(lines, 1): + for match in ref_pat.finditer(line): + ref_path = skill_dir / match.group(1) + if not ref_path.exists(): + issue("error", f"Referenced file does not exist: {match.group(1)}", str(skill_path), i) + + # Check for non-existent command references + cmd_pat = re.compile(r'/([a-z][a-z0-9-]+)') + cmd_names = {c.stem for c in commands} + for i, line in enumerate(lines, 1): + for match in cmd_pat.finditer(line): + cmd_ref = match.group(1) + # Skip common false positives + if cmd_ref in ("dev", "null", "tmp", "etc", "usr", "bin", "opsx"): + continue + if cmd_ref.startswith("opsx:") or cmd_ref.startswith("ce:"): + continue + if cmd_ref not in cmd_names and cmd_ref not in skill_names: + # Only flag if it looks like a slash command (preceded by whitespace or start of line) + before = line[:match.start()].rstrip() + if before == "" or before.endswith((" ", "\t", '"', "'", ":")): + issue("info", f"References /{cmd_ref} — not found in commands/ or skills/", str(skill_path), i) + + # Check eval coverage + if evals_dir: + eval_files = find_evals(evals_dir, plugin_name) + eval_stems = set() + for ef in eval_files: + stem = ef.stem.replace(".eval", "") + eval_stems.add(stem) + + for skill_name in sorted(skill_names): + # Check various naming patterns + has_eval = ( + skill_name in 
eval_stems + or skill_name.replace(plugin_name + "-", "") in eval_stems + or any(skill_name in s for s in eval_stems) + ) + if not has_eval: + issue("warning", f"Skill '{skill_name}' has no corresponding eval file", str(plugin_dir / "skills" / skill_name / "SKILL.md")) + + # Check command path consistency + path_styles = set() + for cmd_path in commands: + text = cmd_path.read_text(encoding="utf-8") + if "plugins/" in text: + path_styles.add("absolute") + if re.search(r'skills/[a-z]', text) and "plugins/" not in text.split("skills/")[0][-20:]: + path_styles.add("relative") + if len(path_styles) > 1: + issue("info", "Commands use mixed path styles (some relative, some absolute)", str(plugin_dir / "commands")) + + return issues + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} [--evals-dir ] [--json]", file=sys.stderr) + sys.exit(2) + + plugin_dir = Path(sys.argv[1]) + output_json = "--json" in sys.argv + + evals_dir = None + if "--evals-dir" in sys.argv: + idx = sys.argv.index("--evals-dir") + if idx + 1 < len(sys.argv): + evals_dir = Path(sys.argv[idx + 1]) + + if not plugin_dir.is_dir(): + print(f"Error: {plugin_dir} is not a directory", file=sys.stderr) + sys.exit(2) + + issues = lint_plugin(plugin_dir, evals_dir) + + if output_json: + print(json.dumps(issues, indent=2)) + else: + for iss in issues: + line = f":{iss['line']}" if iss.get("line") else "" + print(f"[{iss['severity'].upper()}] {iss['file']}{line}: {iss['message']}") + + counts = {} + for iss in issues: + counts[iss["severity"]] = counts.get(iss["severity"], 0) + 1 + if issues: + print(f"\n{len(issues)} issues: {', '.join(f'{v} {k}' for k, v in sorted(counts.items()))}") + else: + print("No issues found.") + + sys.exit(1 if any(i["severity"] == "error" for i in issues) else 0) + + +if __name__ == "__main__": + main() diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/SKILL.md 
b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/SKILL.md new file mode 100644 index 00000000..23e2c346 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/SKILL.md @@ -0,0 +1,52 @@ +--- +name: agentv-eval-review +description: >- + Use when reviewing eval YAML files for quality issues, linting eval files before + committing, checking eval schema compliance, or when asked to "review these evals", + "check eval quality", "lint eval files", or "validate eval structure". + Do NOT use for writing evals (use agentv-eval-writer) or running evals (use agentv-bench). +--- + +# Eval Review + +## Overview + +Lint and review AgentV eval YAML files for structural issues, schema compliance, and quality problems. Runs deterministic checks via script, then applies LLM judgment for semantic issues the script cannot catch. + +## Process + +### Step 1: Run the linter + +Execute `scripts/lint_eval.py` against the target eval files: + +```bash +python scripts/lint_eval.py --json +``` + +The script checks: +- `.eval.yaml` extension +- `description` field present +- Each test has `id`, `input`, and at least one of `criteria`/`expected_output`/`assertions` +- File paths in `type: file` use leading `/` +- `assertions` blocks present (flags tests relying solely on `expected_output`) +- `expected_output` prose detection (flags "The agent should..." patterns) +- Repeated file inputs across tests (recommends top-level `input`) +- Naming prefix consistency across eval files in same directory + +### Step 2: Review script output + +Report the script findings grouped by severity (error > warning > info). For each finding, include the file path and a concrete fix. + +### Step 3: Semantic review (LLM judgment) + +The script catches structural issues but cannot assess: +- **Factual accuracy** — Do tool/command names in expected_output match what the skill documents? 
+- **Coverage gaps** — Are important edge cases missing? +- **Assertion discriminability** — Would assertions pass for both good and bad output? +- **Cross-file consistency** — Do output filenames match across evals and skills? + +Read the relevant SKILL.md files and cross-check against the eval content for these issues. + +## Skill Resources + +- `scripts/lint_eval.py` — Deterministic eval linter (Python 3.11+, stdlib only) diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/scripts/lint_eval.py b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/scripts/lint_eval.py new file mode 100644 index 00000000..1ba45088 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/scripts/lint_eval.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +"""Lint AgentV eval YAML files for common issues. + +Usage: python lint_eval.py [--json] + +Checks: + - File uses .eval.yaml extension + - description field present + - Each test has id, input, criteria + - File paths in type:file use leading / + - assertions blocks present (not relying solely on expected_output) + - expected_output does not contain evaluation criteria prose + - Repeated file inputs across tests (should use top-level input) + - Naming prefix consistency across eval files in same directory + +Exit code: 0 if no issues, 1 if issues found. 
+""" + +import json +import os +import re +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + # Fall back to basic YAML parsing if PyYAML not available + yaml = None + + +def parse_yaml_basic(text: str) -> dict: + """Minimal YAML-ish parser for when PyYAML is unavailable.""" + # This is a best-effort fallback; recommend installing PyYAML + import ast + # Try json first (YAML is a superset of JSON) + try: + return json.loads(text) + except Exception: + pass + return {} + + +def load_yaml(path: Path) -> dict: + text = path.read_text(encoding="utf-8") + if yaml: + return yaml.safe_load(text) or {} + return parse_yaml_basic(text) + + +def lint_file(path: Path) -> list[dict]: + issues = [] + + def issue(severity: str, msg: str, line: int | None = None): + issues.append({"file": str(path), "severity": severity, "message": msg, "line": line}) + + # Check extension + if not path.name.endswith(".eval.yaml"): + issue("error", f"File should use .eval.yaml extension, got: {path.name}") + + try: + data = load_yaml(path) + except Exception as e: + issue("error", f"Failed to parse YAML: {e}") + return issues + + if not isinstance(data, dict): + issue("error", "Root element is not a mapping") + return issues + + # Check description + if "description" not in data: + issue("warning", "Missing top-level 'description' field") + + tests = data.get("tests", []) + if not isinstance(tests, list): + issue("error", "'tests' is not a list") + return issues + + if not tests: + issue("warning", "No tests defined") + return issues + + # Check for top-level input (shared file references) + top_level_input = data.get("input") + + # Collect file values across tests to detect repetition + file_values_per_test: list[list[str]] = [] + + for i, test in enumerate(tests): + test_id = test.get("id", f"test-{i}") + + if "id" not in test: + issue("error", f"Test at index {i} missing 'id'") + + if "input" not in test and top_level_input is None: + issue("error", f"Test 
'{test_id}' missing 'input' and no top-level input defined") + + has_criteria = "criteria" in test + has_expected = "expected_output" in test + has_assertions = "assertions" in test + + if not has_criteria and not has_expected and not has_assertions: + issue("error", f"Test '{test_id}' needs at least one of: criteria, expected_output, assertions") + + # Check assertions present + if not has_assertions and has_expected: + issue("warning", f"Test '{test_id}' has expected_output but no assertions — add deterministic assertions where possible") + + # Check expected_output for prose patterns + if has_expected: + expected = test["expected_output"] + expected_text = "" + if isinstance(expected, str): + expected_text = expected + elif isinstance(expected, list): + for msg in expected: + if isinstance(msg, dict): + content = msg.get("content", "") + if isinstance(content, str): + expected_text += content + + prose_patterns = [ + r"[Tt]he agent should", + r"[Ss]hould identify", + r"[Ss]hould flag", + r"[Ss]hould recommend", + r"[Ss]hould produce", + r"[Ss]hould detect", + r"[Ss]hould load", + r"[Ss]hould run", + ] + for pat in prose_patterns: + if re.search(pat, expected_text): + issue("warning", f"Test '{test_id}' expected_output contains evaluation criteria prose ('{pat.lstrip('[Tt]').lstrip('[Ss]')}...') — use criteria or assertions instead") + break + + # Collect file values from input + test_files = extract_file_values(test.get("input", [])) + file_values_per_test.append(test_files) + + # Check file paths for leading / + for fv in test_files: + if not fv.startswith("/"): + issue("warning", f"Test '{test_id}' file path missing leading '/': {fv}") + + # Check for repeated file inputs + if len(file_values_per_test) >= 2 and not top_level_input: + common_files = set(file_values_per_test[0]) + for fvs in file_values_per_test[1:]: + common_files &= set(fvs) + if common_files: + issue("info", f"File input repeated in every test: {', '.join(sorted(common_files))} — consider 
using top-level input") + + return issues + + +def extract_file_values(input_data) -> list[str]: + """Extract type:file values from input structure.""" + files = [] + if isinstance(input_data, list): + for item in input_data: + if isinstance(item, dict): + content = item.get("content", []) + if isinstance(content, list): + for c in content: + if isinstance(c, dict) and c.get("type") == "file": + v = c.get("value", "") + if v: + files.append(v) + return files + + +def lint_directory(path: Path) -> list[dict]: + issues = [] + eval_files = sorted(path.rglob("*.yaml")) + sorted(path.rglob("*.yml")) + + if not eval_files: + issues.append({"file": str(path), "severity": "warning", "message": "No eval files found", "line": None}) + return issues + + # Check naming prefix consistency + prefixes = set() + for f in eval_files: + name = f.stem.replace(".eval", "") + parts = name.split("-") + if len(parts) >= 2: + prefixes.add(parts[0]) + + if len(prefixes) > 1: + issues.append({ + "file": str(path), + "severity": "info", + "message": f"Inconsistent naming prefixes: {', '.join(sorted(prefixes))}", + "line": None, + }) + + for f in eval_files: + issues.extend(lint_file(f)) + + return issues + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} [--json]", file=sys.stderr) + sys.exit(2) + + target = Path(sys.argv[1]) + output_json = "--json" in sys.argv + + if target.is_file(): + issues = lint_file(target) + elif target.is_dir(): + issues = lint_directory(target) + else: + print(f"Error: {target} not found", file=sys.stderr) + sys.exit(2) + + if output_json: + print(json.dumps(issues, indent=2)) + else: + for iss in issues: + line = f":{iss['line']}" if iss.get("line") else "" + print(f"[{iss['severity'].upper()}] {iss['file']}{line}: {iss['message']}") + + counts = {} + for iss in issues: + counts[iss["severity"]] = counts.get(iss["severity"], 0) + 1 + if issues: + print(f"\n{len(issues)} issues: {', '.join(f'{v} {k}' for k, v in sorted(counts.items()))}") + 
else: + print("No issues found.") + + sys.exit(1 if any(i["severity"] == "error" for i in issues) else 0) + + +if __name__ == "__main__": + main() From 429c323f36b7cdcc707d472a88afc29132f8d17e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 11:14:17 +1100 Subject: [PATCH 02/14] fix: move skills to .agents/skills/ for pi discovery, remove skill-trigger MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pi discovers skills from .agents/skills/ in the workspace, not from plugin directories. Move skills there and remove .allagents/workspace.yaml. Remove skill-trigger assertions — workspace materialization for pi-cli needs separate investigation (pi runs in its own workspace, not the eval's materialized workspace). Baseline: 5/9 tests pass without skill triggering. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../skills/agent-plugin-review/SKILL.md | 0 .../references/skill-quality-checklist.md | 0 .../references/workflow-checklist.md | 0 .../scripts/lint_plugin.py | 0 .../skills/agentv-eval-review/SKILL.md | 0 .../agentv-eval-review/scripts/lint_eval.py | 0 .../.allagents/workspace.yaml | 5 - .../skills/agent-architecture-design/SKILL.md | 108 ------------- .../references/agentic-design-patterns.md | 152 ------------------ .../references/workflow-patterns.md | 105 ------------ 10 files changed, 370 deletions(-) rename evals/agentic-engineering/workspace-template/{plugins/agentic-engineering => .agents}/skills/agent-plugin-review/SKILL.md (100%) rename evals/agentic-engineering/workspace-template/{plugins/agentic-engineering => .agents}/skills/agent-plugin-review/references/skill-quality-checklist.md (100%) rename evals/agentic-engineering/workspace-template/{plugins/agentic-engineering => .agents}/skills/agent-plugin-review/references/workflow-checklist.md (100%) rename evals/agentic-engineering/workspace-template/{plugins/agentic-engineering => .agents}/skills/agent-plugin-review/scripts/lint_plugin.py 
(100%) rename evals/agentic-engineering/workspace-template/{plugins/agentic-engineering => .agents}/skills/agentv-eval-review/SKILL.md (100%) rename evals/agentic-engineering/workspace-template/{plugins/agentic-engineering => .agents}/skills/agentv-eval-review/scripts/lint_eval.py (100%) delete mode 100644 evals/agentic-engineering/workspace-template/.allagents/workspace.yaml delete mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md delete mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md delete mode 100644 evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/SKILL.md b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/SKILL.md similarity index 100% rename from evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/SKILL.md rename to evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/SKILL.md diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/skill-quality-checklist.md similarity index 100% rename from evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md rename to evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/skill-quality-checklist.md diff --git 
a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/workflow-checklist.md b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/workflow-checklist.md similarity index 100% rename from evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/references/workflow-checklist.md rename to evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/workflow-checklist.md diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/scripts/lint_plugin.py b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/scripts/lint_plugin.py similarity index 100% rename from evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-plugin-review/scripts/lint_plugin.py rename to evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/scripts/lint_plugin.py diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/SKILL.md b/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/SKILL.md similarity index 100% rename from evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/SKILL.md rename to evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/SKILL.md diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/scripts/lint_eval.py b/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/scripts/lint_eval.py similarity index 100% rename from evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agentv-eval-review/scripts/lint_eval.py rename to 
evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/scripts/lint_eval.py diff --git a/evals/agentic-engineering/workspace-template/.allagents/workspace.yaml b/evals/agentic-engineering/workspace-template/.allagents/workspace.yaml deleted file mode 100644 index 40d57f65..00000000 --- a/evals/agentic-engineering/workspace-template/.allagents/workspace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -plugins: - - source: ./plugins/agentic-engineering - -clients: - - pi diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md deleted file mode 100644 index 283620b4..00000000 --- a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/SKILL.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -name: agent-architecture-design -description: >- - Use when designing an AI agent system, selecting agentic design patterns, - planning multi-phase workflows, choosing between single-agent and multi-agent architectures, - or when asked "what kind of agent should I build", "how should I structure this automation", - "design an agent for X", or "which agentic pattern fits this problem". ---- - -# Agent Architecture Design - -## Overview - -Guide the selection and design of the correct agentic architecture by diagnosing the problem type, mapping it to a proven design pattern, and defining the workflow structure, tooling, and management model. 
- -## Process - -### Phase 1: Problem Diagnosis - -Categorize the request on two axes: - -| | Task-Level (single job) | Project-Level (coordination needed) | -|---|---|---| -| **Software-Shaped** (working code/system) | Single-Agent Iterative Loop | Autonomous Pipeline or Multi-Agent System | -| **Metric-Shaped** (optimize a number) | Optimization Loop | Optimization Loop + Multi-Agent System | - -**Diagnosis questions:** -1. Is the goal working software or optimizing a metric? -2. Is this a single discrete task or multiple coordinated parts? -3. How much human involvement is acceptable during execution? -4. What scale justifies the architecture complexity? - -### Phase 2: Pattern Selection - -Load `references/agentic-design-patterns.md` for full details on each pattern. Summary: - -**Single-Agent Iterative Loop** (Agentic IDE) -- Human = manager, Agent = worker -- Decompose the problem into small chunks (UI, API, tests) -- Agent gets a workspace (terminal, files, search) -- Best for: individual developer productivity on discrete tasks - -**Autonomous Pipeline** (Zero-Human Loop) -- Spec In → Autonomous Zone → Eval Out -- Heavy human involvement at start (specs) and end (review), zero in the middle -- Requires robust evals — iterations happen automatically until eval passes -- Best for: zero-human-intervention software delivery - -**Optimization Loop** (Self-Improving Agent) -- Hill climbing against a specific metric -- Agent tries paths, fails, backtracks -- Needs a clear optimization target -- Best for: reaching peak of an optimization metric through experimentation - -**Multi-Agent System** (Hierarchical/Supervisor Pattern) -- Specialized roles with defined handoffs (Researcher → Writer → Editor → Publisher) -- Complexity lies in context management between agents -- Only justified at scale (10,000 tickets, not 10) -- Best for: seamless coordination across specialized AI workers - -### Phase 3: Workflow Architecture - -After selecting a pattern, define the 
workflow structure. Load `references/workflow-patterns.md` for framework-specific patterns. - -**For each pattern, define:** - -1. **Phases** — What sequential or parallel steps does the workflow execute? -2. **Artifacts** — What does each phase produce? (specs, designs, tasks, code, reports) -3. **Gates** — What must be true before proceeding to the next phase? -4. **Tooling** — What tools/MCPs does each agent need? -5. **Context flow** — How is information passed between phases/agents? -6. **Resumption** — How does the workflow recover from interruption? - -**Pattern → Workflow mapping:** - -| Agentic Design Pattern | Typical Workflow | -|---|---| -| Single-Agent Iterative Loop | Single-phase: decompose → implement → verify | -| Autonomous Pipeline | OpenSpec-style: validate → propose → design → implement → verify | -| Optimization Loop | Iteration loop: hypothesize → test → measure → backtrack/advance | -| Multi-Agent System | Role pipeline: role₁ → handoff → role₂ → handoff → roleₙ | - -### Phase 4: Output - -Produce a design document covering: - -1. **Diagnosis** — Software or metric shaped, task or project level -2. **Recommended Pattern** — Which agentic architecture and why -3. **Workflow Design** — Phases, artifacts, gates, context flow -4. **Scaffolding Plan** — Tools, MCPs, evals the agent needs -5. **Management Model** — Human role (Manager, Observer, or Spec-Writer) - -## Implementation Rules - -1. **Simple scales better** — Do not recommend 3-level management if 2-level works. Simple configurations are more performant. -2. **Context is everything** — Agents depend entirely on the context and scaffolding provided by the architect. Design the scaffolding, not just the agent. -3. **Human-centered → Agent-centered** — For large projects, move from "human managing every agent" to "planner agent managing sub-agents" where the human observes. -4. **Avoid pattern-confusion** — Never use an Optimization Loop to build a novel. 
Never use a Single-Agent Loop for a project requiring specialized multi-agent orchestration. -5. **Scale justifies complexity** — Multi-agent orchestration is only worth it at scale. For small problems, a single well-prompted agent outperforms a complex framework. - -## Skill Resources - -- `references/agentic-design-patterns.md` — Detailed pattern descriptions with examples and anti-patterns -- `references/workflow-patterns.md` — Workflow patterns from OpenSpec, Superpowers, and Compound Engineering - -## Related Skills - -- **agent-plugin-review** — Review an implemented plugin against architecture best practices diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md deleted file mode 100644 index 0cebab50..00000000 --- a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/agentic-design-patterns.md +++ /dev/null @@ -1,152 +0,0 @@ -# Agentic Design Patterns - -Four foundational architectures for AI agent systems. Each pattern defines a management model, workflow structure, and set of anti-patterns. - -## Single-Agent Iterative Loop (Agentic IDE) - -**Use when:** Problem is software-shaped and scale is task-level. - -**Architecture:** -- Human is the manager; agent is the worker -- Focus on decomposition — break the big problem into small, well-defined chunks -- Each chunk is independently implementable and testable - -**Tooling requirements:** -- Terminal access (shell, build tools, test runners) -- File system access (read, write, search) -- Search tools (grep, glob, web search) -- Version control (git) - -**Workflow:** Single-phase — decompose → implement → verify - -**Management model:** Human as manager. 
Human defines what to build, agent builds it, human reviews. - -**Example:** A developer using Claude Code to implement a feature. They describe what they want, the agent writes the code, developer reviews and iterates. - -**Anti-patterns:** -- Using a single-agent loop for a project that needs 10+ coordinated agents -- No decomposition — giving the agent one massive task instead of focused chunks -- No verification step — trusting agent output without review - ---- - -## Autonomous Pipeline (Zero-Human Loop) - -**Use when:** Problem is software-shaped and high autonomy is required. - -**Architecture:** -- Spec In → Autonomous Zone → Eval Out -- Human involvement is heavy at start (specs) and end (review), zero in the middle -- Iterations (v0.1 → v1.0) happen automatically until eval passes - -**Requirements:** -- Robust evals are mandatory — the system cannot self-correct without them -- Specs must be precise enough to generate working systems -- Evals must be discriminating — pass for good output, fail for bad - -**Workflow:** OpenSpec-style pipeline: -1. Validate (check requirements against reality) -2. Propose (define WHAT and WHY) -3. Design (plan HOW) -4. Implement (TDD through task checklist) -5. Verify (build + test + spec traceability) - -**Management model:** Human as spec-writer. Human writes specs and reviews final output. Everything in between is autonomous. - -**Example:** A spec-driven development plugin where the developer provides a work item number, and the system autonomously validates requirements, designs the implementation, codes it with TDD, and produces a PR. - -**Anti-patterns:** -- No evals — the system has no way to know when it's done or if it's correct -- Specs too vague — "make it better" is not a spec -- Human intervening in the autonomous zone — defeats the purpose - ---- - -## Optimization Loop (Self-Improving Agent) - -**Use when:** Problem is metric-shaped (optimization). 
- -**Architecture:** -- Hill climbing against a specific metric -- Agent tries paths, fails, and backtracks -- Each iteration measures progress against the target - -**Requirements:** -- Clear, measurable optimization target -- Fast feedback loop (metric must be computable quickly) -- Permission to explore and fail - -**Workflow:** Iteration loop: -1. Hypothesize (propose a change) -2. Test (apply the change) -3. Measure (evaluate against metric) -4. Decide (advance if improved, backtrack if not) -5. Repeat until target reached or budget exhausted - -**Management model:** Human as observer. Human defines the metric and constraints, agent explores the solution space. - -**Example:** Optimizing a prompt's accuracy against an eval suite. Agent tries variations, measures pass rate, keeps improvements, discards regressions. - -**Anti-patterns:** -- No clear metric — "make it better" is not optimizable -- Using for creative tasks — novels, designs, art have no single metric -- No backtracking — agent must be allowed to undo bad changes - ---- - -## Multi-Agent System (Hierarchical/Supervisor Pattern) - -**Use when:** Problem requires specialized roles and complex handoffs. - -**Architecture:** -- Define specialized roles (Researcher → Writer → Editor → Publisher) -- Focus on handoffs — complexity lies in context management between agents -- Each role has its own tools, context, and success criteria - -**Scale requirement:** Only justified when the volume warrants it. Managing 10,000 tickets needs orchestration. Managing 10 does not. - -**Workflow:** Role pipeline with handoffs: -1. Role₁ performs its task, produces output artifact -2. Handoff: artifact + summary passed to Role₂ -3. Role₂ performs its task, produces next artifact -4. Continue until pipeline complete - -**Management model:** Human as observer or planner-manager. For large scale, a planner agent manages sub-agents while human observes. 
- -**Context management:** -- Each handoff loses context — design artifacts to carry essential information -- Summaries at each handoff prevent context window overflow -- Shared state (files, databases) can bridge context gaps - -**Example:** A content pipeline where a researcher gathers information, a writer produces a draft, an editor refines it, and a publisher formats and distributes it. - -**Anti-patterns:** -- Over-engineering — using orchestration for a 3-step task one person could do -- Poor handoffs — losing critical context between agents -- No specialization — all agents doing the same thing (just use a single-agent loop) -- Too many management layers — 3-level hierarchies are almost always slower than 2-level - ---- - -## Pattern Selection Decision Tree - -``` -Is the goal working software or optimizing a metric? -├── Software-shaped -│ ├── Single discrete task? → Single-Agent Iterative Loop -│ ├── Needs full autonomy (spec → code → eval)? → Autonomous Pipeline -│ └── Multiple specialized roles needed at scale? → Multi-Agent System -└── Metric-shaped - ├── Single metric to optimize? → Optimization Loop - └── Multiple metrics across coordinated roles? → Optimization Loop + Multi-Agent System -``` - -## Hybrid Architectures - -Real systems often combine patterns: - -- **Autonomous Pipeline + Optimization Loop:** Auto-iterate on prompts using eval scores -- **Single-Agent Loop + Multi-Agent System:** Individual coding agents orchestrated by a planner for large projects -- **Autonomous Pipeline + Multi-Agent System:** Autonomous pipeline with specialized roles (validate-agent, design-agent, code-agent) - -When combining, keep the management model simple. A 2-level structure (planner + workers) outperforms deeper hierarchies. 
diff --git a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md b/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md deleted file mode 100644 index 1465ce11..00000000 --- a/evals/agentic-engineering/workspace-template/plugins/agentic-engineering/skills/agent-architecture-design/references/workflow-patterns.md +++ /dev/null @@ -1,105 +0,0 @@ -# Workflow Patterns by Framework - -Patterns from reference frameworks for designing agent workflows, organized by agentic design pattern. - -## OpenSpec (OPSX Conventions) - -**Source:** [OpenSpec](https://github.com/Fission-AI/OpenSpec) - -**Best for:** Autonomous Pipeline and Multi-Agent System - -**Core concept:** Artifact-driven dependency graph. Commands chain through file existence, not sequential phases. - -**Default workflow (spec-driven):** -``` -/opsx:explore → /opsx:propose → /opsx:apply → /opsx:archive -``` - -**Expanded workflow:** -``` -/opsx:new → /opsx:continue (×N) → /opsx:apply → /opsx:verify → /opsx:archive -``` - -**Key patterns:** -- **Artifact gates** — Each phase produces a file. Next phase checks file exists before starting. -- **Delta specs** — Changes are expressed as ADDED/MODIFIED/REMOVED operations on existing specs, not full rewrites. -- **Fast-forward** (`/opsx:ff`) — Generate all planning artifacts at once for clear-scope work. -- **Schema-configurable** — Workflow phases defined in `schema.yaml` as a DAG, not hardcoded. -- **Archive merges deltas** — Completed changes are merged back into main specs, keeping specs as source of truth. 
- -**Artifact types:** -| Artifact | Purpose | -|---|---| -| `proposal.md` | WHAT and WHY (scope, non-goals, acceptance criteria) | -| `specs/*.md` | Behavior contracts with Given/When/Then scenarios | -| `design.md` | HOW (technical approach, decisions, risks) | -| `tasks.md` | Implementation checklist with checkboxes | -| `verify-report.md` | Verification results and traceability | - ---- - -## Superpowers - -**Source:** [Superpowers](https://github.com/obra/superpowers/) - -**Best for:** Single-Agent Iterative Loop and Autonomous Pipeline - -**Core concept:** Skills as workflow phases with hard gates and mandatory skill checks. - -**Workflow phases:** -1. Brainstorming — Explore requirements before committing -2. Writing Plans — Task decomposition -3. Executing Plans / Subagent-Driven Development — Implementation -4. Test-Driven Development — RED-GREEN-REFACTOR during implementation -5. Requesting Code Review — Verification -6. Finishing a Development Branch — Completion - -**Key patterns:** -- **``** — Synchronization points that prevent progression without explicit checks. Agent must verify conditions before proceeding. -- **The 1% Rule** — If there's even a 1% chance a skill applies, invoke it. Prevents agents from rationalizing past important steps. -- **`SUBAGENT-STOP`** — Prevents subagents from invoking full skill workflows when executing specific tasks. -- **Brainstorming before planning** — Always explore intent and requirements before committing to a plan. -- **Two-stage code review** — Spec compliance review then code quality review (not one combined review). - ---- - -## Compound Engineering - -**Source:** [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) - -**Best for:** Autonomous Pipeline with learning loop - -**Core concept:** Four-phase repeating cycle where learnings compound across iterations. 
- -**Workflow:** -``` -/ce:plan → /ce:work → /ce:review → /ce:compound → repeat -``` - -**Key patterns:** -- **Compounding loop** (`/ce:compound`) — After each cycle, document what worked and what didn't. Feed learnings into future planning. Each cycle gets easier. -- **Autonomous modes:** - - `/lfg` (Let's Go) — Sequential full cycle - - `/slfg` (Swarm LFG) — Parallel execution during review/testing -- **Multi-agent review** — Review phase dispatches multiple agents for parallel code review. -- **Knowledge accumulation** — Solutions documented in the compound phase become reusable patterns. - ---- - -## Framework Selection by Design Pattern - -| Agentic Design Pattern | Primary Framework | Secondary Framework | -|---|---|---| -| Single-Agent Iterative Loop | Superpowers (brainstorm → plan → TDD) | — | -| Autonomous Pipeline | OpenSpec (validate → propose → design → apply → verify) | Compound Engineering (learning loop) | -| Optimization Loop | Custom iteration loop (hypothesize → test → measure → decide) | — | -| Multi-Agent System | OpenSpec artifact gates + Superpowers hard gates | Compound Engineering (per-role learning) | - -## Universal Patterns (All Architectures) - -1. **Hard gates** — Check prerequisites before proceeding. Never silently skip. -2. **Artifact persistence** — Write phase outputs to disk, not just conversation context. Enables cross-session resumption. -3. **Workflow state metadata** — Track which phases are complete in a YAML file alongside artifacts. -4. **Error handling** — Standardize retry policy. Clear failure messages naming what to fix. -5. **Trivial escape hatch** — Document when it's OK to skip phases (small fixes, config changes). -6. **Artifact self-correction** — Downstream phases can fix factual errors in upstream artifacts, logged in a corrections section. 
From 80079877aaaff07fd6ddf3b94c320ec717bcbdcd Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 11:46:44 +1100 Subject: [PATCH 03/14] fix(pi-cli): use eval-materialized workspace as cwd Pi-cli was always creating its own temp workspace and ignoring the eval's materialized workspace (request.cwd). This meant pi couldn't see files from the workspace template. Now consistent with copilot-cli: when request.cwd or config.cwd is provided, use it directly. Only create a temp workspace when no external cwd is available. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../core/src/evaluation/providers/pi-cli.ts | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts index d580a82a..c182f85a 100644 --- a/packages/core/src/evaluation/providers/pi-cli.ts +++ b/packages/core/src/evaluation/providers/pi-cli.ts @@ -78,15 +78,18 @@ export class PiCliProvider implements Provider { const startTime = new Date().toISOString(); const startMs = Date.now(); - const workspaceRoot = await this.createWorkspace(); + // Use eval-materialized workspace (request.cwd) when available, consistent with copilot-cli. + // Only create a temp workspace when no cwd is provided. + const hasExternalCwd = !!(request.cwd || this.config.cwd); + const workspaceRoot = hasExternalCwd ? 
undefined : await this.createWorkspace(); + const cwd = this.resolveCwd(workspaceRoot, request.cwd); const logger = await this.createStreamLogger(request).catch(() => undefined); try { // Save prompt to file for debugging/logging - const promptFile = path.join(workspaceRoot, PROMPT_FILENAME); + const promptFile = path.join(cwd, PROMPT_FILENAME); await writeFile(promptFile, request.question, 'utf8'); const args = this.buildPiArgs(request.question, inputFiles); - const cwd = this.resolveCwd(workspaceRoot, request.cwd); const result = await this.executePi(args, cwd, request.signal, logger); @@ -136,7 +139,7 @@ export class PiCliProvider implements Provider { args, executable: this.config.executable, promptFile, - workspace: workspaceRoot, + workspace: workspaceRoot ?? cwd, inputFiles, logFile: logger?.filePath, }, @@ -148,18 +151,23 @@ export class PiCliProvider implements Provider { }; } finally { await logger?.close(); - await this.cleanupWorkspace(workspaceRoot); + if (workspaceRoot) { + await this.cleanupWorkspace(workspaceRoot); + } } } - private resolveCwd(workspaceRoot: string, cwdOverride?: string): string { + private resolveCwd(workspaceRoot: string | undefined, cwdOverride?: string): string { if (cwdOverride) { return path.resolve(cwdOverride); } - if (!this.config.cwd) { + if (this.config.cwd) { + return path.resolve(this.config.cwd); + } + if (workspaceRoot) { return workspaceRoot; } - return path.resolve(this.config.cwd); + return process.cwd(); } private buildPiArgs(prompt: string, inputFiles: readonly string[] | undefined): string[] { From c2fe9c32696e997d9e177a06bc4bdc7f8fa767e1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 13:47:28 +1100 Subject: [PATCH 04/14] refactor: replace duplicated skills with before_all setup hook Remove copied skills from workspace template. Add scripts/setup.sh that copies skills from source at eval runtime, preventing staleness. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agent-plugin-review.eval.yaml | 3 + .../skills/agent-plugin-review/SKILL.md | 102 -------- .../references/skill-quality-checklist.md | 125 --------- .../references/workflow-checklist.md | 78 ------ .../scripts/lint_plugin.py | 198 --------------- .../skills/agentv-eval-review/SKILL.md | 52 ---- .../agentv-eval-review/scripts/lint_eval.py | 239 ------------------ .../workspace-template/scripts/setup.sh | 19 ++ 8 files changed, 22 insertions(+), 794 deletions(-) delete mode 100644 evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/SKILL.md delete mode 100644 evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/skill-quality-checklist.md delete mode 100644 evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/workflow-checklist.md delete mode 100644 evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/scripts/lint_plugin.py delete mode 100644 evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/SKILL.md delete mode 100644 evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/scripts/lint_eval.py create mode 100644 evals/agentic-engineering/workspace-template/scripts/setup.sh diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 9fcd30d4..34f60423 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -7,6 +7,9 @@ execution: workspace: template: ./workspace-template + hooks: + before_all: + command: bash scripts/setup.sh tests: - id: detect-missing-eval diff --git a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/SKILL.md b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/SKILL.md deleted file mode 100644 index 
3be08356..00000000 --- a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/SKILL.md +++ /dev/null @@ -1,102 +0,0 @@ ---- -name: agent-plugin-review -description: >- - Use when reviewing an AI plugin pull request, auditing plugin quality before release, - or when asked to "review a plugin PR", "review skills in this PR", "check plugin quality", - or "review workflow architecture". Covers skill quality, structural linting, and workflow - architecture review. ---- - -# Plugin Review - -## Overview - -Review AI plugin PRs by running deterministic structural checks first, then applying LLM judgment for skill quality and workflow architecture. Post findings as inline PR comments. - -## Process - -### Step 1: Structural lint - -Run `scripts/lint_plugin.py` against the plugin directory: - -```bash -python scripts/lint_plugin.py --evals-dir --json -``` - -The script checks: -- Every `skills/*/SKILL.md` has a corresponding eval file -- SKILL.md frontmatter has `name` and `description` -- No hardcoded local paths (drive letters, absolute OS paths) -- No version printing instructions -- Referenced files (`references/*.md`) exist -- Commands reference existing skills -- Path style consistency across commands - -Report findings grouped by severity (error > warning > info). - -### Step 2: Eval lint - -If the PR includes eval files, invoke `agentv-eval-review` for AgentV-specific eval quality checks. - -### Step 3: Skill quality review (LLM judgment) - -For each SKILL.md, check against `references/skill-quality-checklist.md`: - -- Description starts with "Use when..." 
and describes triggering conditions only (not workflow) -- Description does NOT summarize the skill's process — this causes agents to follow the description instead of reading the SKILL.md body -- Body is concise — only include what the agent doesn't already know -- Imperative/infinitive form, not second person -- Heavy reference (100+ lines) moved to `references/` files -- One excellent code example beats many mediocre ones -- Flowcharts only for non-obvious decisions -- Keywords throughout for search discovery -- Cross-references use skill name with requirement markers, not `@` force-load syntax -- Discipline-enforcing skills have rationalization tables, red flags lists, and explicit loophole closures -- Consistency — no contradictions within or across files (tool names, filenames, commands, rules) -- No manual routing workarounds — if AGENTS.md or instruction files contain heavy TRIGGER/ACTION routing tables or skill-chain logic, the skill descriptions are likely too weak. Good descriptions enable auto-discovery without manual routing. - -### Step 4: Workflow architecture review (LLM judgment) - -For plugins with multi-phase workflows, check against `references/workflow-checklist.md`: - -- Hard gates between phases (artifact existence checks) -- Artifact persistence convention (defined output directory) -- Workflow state metadata for cross-session resumption -- Resumption protocol (detect existing artifacts, skip completed phases) -- Standardized error handling with retry -- Trivial change escape hatch -- Artifact self-correction with corrections log -- Learning loop mechanism - -### Step 5: Post review - -Post findings as inline PR comments at specific line numbers. 
Group by severity: -- **Critical** — Broken references, missing evals, factual contradictions, missing hard gates -- **Medium** — Naming inconsistencies, hardcoded paths, missing assertions, ad-hoc error handling -- **Low** — Style inconsistencies, description improvements - -Use a PR review (not individual comments) to batch all findings. - -## Skill Resources - -- `scripts/lint_plugin.py` — Deterministic plugin linter (Python 3.11+, stdlib only) -- `references/skill-quality-checklist.md` — Skill quality checklist (CSO, descriptions, content, discipline skills) -- `references/workflow-checklist.md` — Workflow architecture checklist (OpenSpec, hard gates, artifacts) - -## External References - -For deeper research on challenging reviews, consult these resources via web fetch, deepwiki, or clone the repo locally: - -- [Agent Skills specification](https://agentskills.io/specification) — Official SKILL.md format, frontmatter fields, progressive disclosure rules -- [Agent Skills best practices](https://agentskills.io/skill-creation/best-practices) — Context spending, calibrating control, gotchas, scripts, validation loops -- [Agent Skills description optimization](https://agentskills.io/skill-creation/optimizing-descriptions) — Trigger testing, train/validation splits, overfitting avoidance -- [Agent Skills using scripts](https://agentskills.io/skill-creation/using-scripts) — Self-contained scripts, --help, structured output, idempotency, exit codes -- [AgentV documentation](https://agentv.dev/) — Eval YAML schema, assertion types, workspace evals, multi-provider targets -- [OpenSpec](https://github.com/Fission-AI/OpenSpec) — Spec-driven development framework (OPSX conventions, artifact graphs, hard gates, delta specs) -- [Superpowers](https://github.com/obra/superpowers/) — Claude Code plugin with `` pattern, brainstorming workflow, skill-based development phases -- [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) — Four-phase workflow 
(Plan/Work/Review/Compound) with learning loop pattern - -## Related Skills - -- **agentv-eval-review** — Lint and review AgentV eval files (invoke for eval-specific checks) -- **agent-architecture-design** — Design agent architectures from scratch diff --git a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/skill-quality-checklist.md b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/skill-quality-checklist.md deleted file mode 100644 index 1a8f279c..00000000 --- a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/skill-quality-checklist.md +++ /dev/null @@ -1,125 +0,0 @@ -# Skill Quality Checklist - -Derived from [Superpowers writing-skills](https://github.com/obra/superpowers/) and [Anthropic's skill authoring best practices](https://docs.anthropic.com/en/docs/agents-and-tools/agent-skills). - -## Frontmatter - -- [ ] Only two fields: `name` and `description` (no other fields supported) -- [ ] Max 1024 characters total in frontmatter -- [ ] `name` uses only letters, numbers, and hyphens (no parentheses, special chars) -- [ ] `description` written in third person -- [ ] `description` starts with "Use when..." focusing on triggering conditions -- [ ] `description` describes WHEN to use, NOT WHAT the skill does -- [ ] `description` does NOT summarize the skill's workflow or process - -### Why description must not summarize workflow - -Testing revealed that when a description summarizes the skill's workflow, Claude may follow the description instead of reading the full SKILL.md content. A description saying "code review between tasks" caused Claude to do ONE review, even though the SKILL.md flowchart clearly showed TWO reviews. When the description was changed to just triggering conditions, Claude correctly read and followed the full skill. 
- -### Description examples - -```yaml -# BAD: Summarizes workflow - Claude may follow this instead of reading skill -description: Use when executing plans - dispatches subagent per task with code review between tasks - -# BAD: Too much process detail -description: Use for TDD - write test first, watch it fail, write minimal code, refactor - -# BAD: Too abstract, vague -description: For async testing - -# BAD: First person -description: I can help you with async tests when they're flaky - -# GOOD: Just triggering conditions, no workflow summary -description: Use when executing implementation plans with independent tasks in the current session - -# GOOD: Triggering conditions only -description: Use when implementing any feature or bugfix, before writing implementation code - -# GOOD: Problem-focused, technology-agnostic -description: Use when tests have race conditions, timing dependencies, or pass/fail inconsistently -``` - -## Content Quality - -### Conciseness (Claude Search Optimization) - -- [ ] SKILL.md body is concise — only include what Claude doesn't already know -- [ ] Challenge each paragraph: "Does Claude really need this explanation?" 
-- [ ] Target word counts: - - Frequently-loaded skills: < 200 words - - Standard skills: < 500 words - - With references: SKILL.md lean, details in reference files -- [ ] Move heavy reference (100+ lines) to separate files -- [ ] Use cross-references instead of repeating content from other skills -- [ ] Compress examples — one excellent example beats many mediocre ones - -### Structure - -- [ ] Overview: core principle in 1-2 sentences -- [ ] When to Use: symptoms and use cases (flowchart only if decision is non-obvious) -- [ ] When NOT to use: explicit exclusions -- [ ] Core Pattern: before/after comparison (for techniques/patterns) -- [ ] Quick Reference: table or bullets for scanning -- [ ] Common Mistakes: what goes wrong + fixes -- [ ] Inline code for simple patterns, separate file for heavy reference - -### Writing Style - -- [ ] Imperative/infinitive form (verb-first instructions) -- [ ] NOT second person ("you should...") -- [ ] Technology-agnostic triggers unless skill is technology-specific -- [ ] Keywords throughout for search discovery (error messages, symptoms, synonyms, tool names) - -### Degrees of Freedom - -Match specificity to the task's fragility: - -| Freedom Level | When to Use | Example | -|---|---|---| -| High (text instructions) | Multiple valid approaches, context-dependent | Code review process | -| Medium (pseudocode/templates) | Preferred pattern exists, some variation OK | Report generation | -| Low (exact scripts) | Precise steps required, fragile operations | Database migration | - -## File Organization - -- [ ] Flat namespace — all skills in one searchable directory -- [ ] Supporting files only for: heavy reference (100+ lines), reusable tools/scripts -- [ ] Everything else inline in SKILL.md -- [ ] No narrative storytelling ("In session 2025-10-03, we found...") -- [ ] No multi-language dilution (one excellent example, not 5 mediocre ones) - -## Flowchart Usage - -- [ ] Use ONLY for non-obvious decision points, process loops, "A vs 
B" decisions -- [ ] Never use for: reference material (→ tables), code (→ code blocks), linear instructions (→ numbered lists) -- [ ] Labels must have semantic meaning (not "step1", "helper2") - -## Cross-References - -- [ ] Use skill name with explicit requirement markers: `**REQUIRED:** Use skill-name` -- [ ] Do NOT use `@` syntax to force-load files (burns context) -- [ ] Do NOT repeat content available in referenced skills - -## Anti-Patterns to Flag - -| Anti-Pattern | Why It's Bad | -|---|---| -| Narrative examples ("In session X, we found...") | Too specific, not reusable | -| Multi-language examples (JS, Python, Go, etc.) | Mediocre quality, maintenance burden | -| Code in flowcharts | Can't copy-paste, hard to read | -| Generic labels (helper1, step2) | No semantic meaning | -| Version printing instructions | Fragile, rely on git history | -| Hardcoded local paths | Machine-specific, not portable | -| Description summarizes workflow | Claude follows description, skips SKILL.md body | - -## Discipline-Enforcing Skills (Additional Checks) - -For skills that enforce rules (TDD, verification, coding standards): - -- [ ] Specific workarounds explicitly forbidden (not just "don't do X" but "don't keep it as reference, don't adapt it, delete means delete") -- [ ] Rationalization table present (common excuses + reality) -- [ ] Red flags list for self-checking -- [ ] "Spirit vs letter" addressed: "Violating the letter IS violating the spirit" -- [ ] Hard gates at critical decision points diff --git a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/workflow-checklist.md b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/workflow-checklist.md deleted file mode 100644 index c5f3fa1f..00000000 --- a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/references/workflow-checklist.md +++ /dev/null @@ -1,78 +0,0 @@ -# Workflow Architecture Checklist - -Review 
multi-phase plugin workflows against these patterns, derived from [OpenSpec](https://github.com/Fission-AI/OpenSpec) (OPSX conventions), [Superpowers](https://github.com/obra/superpowers/), and [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin). - -## Phase Coverage - -Compare the plugin's workflow phases against the OpenSpec artifact model: - -| OpenSpec Phase | OPSX Command | Expected Plugin Equivalent | -|---|---|---| -| Explore | `/opsx:explore` | Research mode — investigate without creating artifacts | -| Validate | (custom) | Check requirements against real codebase before design | -| Propose | `/opsx:propose` | Define WHAT and WHY with acceptance criteria | -| Design | (via schema) | Plan HOW — file-level changes, multi-repo coordination | -| Tasks | (via schema) | Standalone `tasks.md` with `- [ ]` checkboxes | -| Apply | `/opsx:apply` | Implement through task checklist with TDD | -| Verify | `/opsx:verify` | Build + test + trace implementation back to specs | -| Archive | `/opsx:archive` | Finalize, merge deltas, persist learnings | - -Not all phases are required for every plugin. Flag missing phases only when the gap would cause real problems. - -## Hard Gates - -From [Superpowers](https://github.com/obra/superpowers/) `` pattern: - -- [ ] Each phase checks for prerequisite artifacts before proceeding -- [ ] Gate failure message tells the user which command/skill to run first -- [ ] Gates cannot be silently bypassed -- [ ] Gate checks happen at the start of the skill, before any work - -Example gate: -``` -HARD GATE: `hld-review.md` MUST exist in {output_dir}/. -If missing, inform the user: "Run the design-review skill first." STOP. 
-``` - -## Artifact Contracts - -- [ ] Each phase produces a defined output artifact (e.g., `context.md`, `design.md`, `tasks.md`) -- [ ] Output format of phase N matches expected input of phase N+1 -- [ ] Artifact location convention is defined (not just `{output_dir}/`) -- [ ] Artifacts persist to disk (not just conversation context) for cross-session resumption - -## Workflow State - -- [ ] Workflow state tracked in a metadata file (e.g., `.workflow.yaml`) alongside artifacts -- [ ] Metadata records: which phases are complete, timestamps, WI/issue number -- [ ] Resumption protocol detects existing artifacts and skips completed phases -- [ ] Partial completion is handled (e.g., Phase 4 with N-1 of N agents succeeding) - -## Error Handling - -- [ ] Standardized retry policy across all skills (e.g., retry MCP calls 3x with exponential backoff) -- [ ] Clear failure reporting — user knows what failed and what to do next -- [ ] Errors don't silently corrupt downstream phases -- [ ] Critical failures (P0 findings, merge conflicts) stop the workflow - -## Escape Hatches - -- [ ] Trivial change escape: small fixes can skip spec phases -- [ ] Criteria for "trivial" are documented (e.g., < 20 lines, single file, no schema change) -- [ ] Artifact self-correction: downstream phases can fix factual errors in upstream artifacts -- [ ] Corrections are logged (e.g., `## Corrections Log` section) for auditability - -## Learning Loop - -From [Compound Engineering](https://github.com/EveryInc/compound-engineering-plugin) `/ce:compound` pattern: - -- [ ] Mechanism exists to capture patterns from completed work -- [ ] Learnings feed back into future workflow runs (e.g., review guidelines, common patterns) -- [ ] Learning artifacts are version-controlled and mergeable - -## Fast-Forward Mode - -From OpenSpec `/opsx:ff`: - -- [ ] For well-understood changes, all planning artifacts can be generated in one pass -- [ ] Fast-forward mode is optional — users can still step through phases 
individually diff --git a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/scripts/lint_plugin.py b/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/scripts/lint_plugin.py deleted file mode 100644 index 15f0be18..00000000 --- a/evals/agentic-engineering/workspace-template/.agents/skills/agent-plugin-review/scripts/lint_plugin.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python3 -"""Lint AI plugin structure for common issues. - -Usage: python lint_plugin.py [--evals-dir ] [--json] - -Checks: - - Every skills/*/SKILL.md has a corresponding eval file - - SKILL.md frontmatter has name and description - - No hardcoded local paths (drive letters, absolute OS paths) - - No version printing instructions - - Commands reference existing skills - - Path style consistency across commands - - Referenced files (references/*.md) exist - -Exit code: 0 if no issues, 1 if issues found. -""" - -import json -import os -import re -import sys -from pathlib import Path - - -def find_skills(plugin_dir: Path) -> list[Path]: - """Find all SKILL.md files in the plugin.""" - return sorted(plugin_dir.rglob("skills/*/SKILL.md")) - - -def find_evals(evals_dir: Path, plugin_name: str) -> list[Path]: - """Find eval files for a plugin.""" - plugin_evals = evals_dir / plugin_name - if not plugin_evals.exists(): - return [] - return sorted(plugin_evals.rglob("*.yaml")) + sorted(plugin_evals.rglob("*.yml")) - - -def find_commands(plugin_dir: Path) -> list[Path]: - """Find command files.""" - commands_dir = plugin_dir / "commands" - if not commands_dir.exists(): - return [] - return sorted(commands_dir.glob("*.md")) - - -def lint_plugin(plugin_dir: Path, evals_dir: Path | None = None) -> list[dict]: - issues = [] - - def issue(severity: str, msg: str, file: str | None = None, line: int | None = None): - issues.append({ - "file": file or str(plugin_dir), - "severity": severity, - "message": msg, - "line": line, - }) - - plugin_name = 
plugin_dir.name - skills = find_skills(plugin_dir) - commands = find_commands(plugin_dir) - - # Collect skill names - skill_names = set() - for skill_path in skills: - skill_name = skill_path.parent.name - skill_names.add(skill_name) - - # Check each SKILL.md - for skill_path in skills: - skill_name = skill_path.parent.name - text = skill_path.read_text(encoding="utf-8") - lines = text.splitlines() - - # Check frontmatter - if not text.startswith("---"): - issue("error", "Missing YAML frontmatter", str(skill_path)) - else: - fm_end = text.find("---", 3) - if fm_end == -1: - issue("error", "Unclosed YAML frontmatter", str(skill_path)) - else: - fm = text[3:fm_end] - if "name:" not in fm: - issue("error", "Frontmatter missing 'name' field", str(skill_path)) - if "description:" not in fm: - issue("error", "Frontmatter missing 'description' field", str(skill_path)) - - # Check for hardcoded paths - drive_letter_pat = re.compile(r'[A-Z]:\\[A-Za-z]') - for i, line in enumerate(lines, 1): - if drive_letter_pat.search(line): - # Skip if it's in a table header or obviously an example - if "Override" not in line and "Example" not in line: - issue("warning", f"Hardcoded local path detected", str(skill_path), i) - - # Check for version printing - version_pat = re.compile(r'print.*version|version \d{8}', re.IGNORECASE) - for i, line in enumerate(lines, 1): - if version_pat.search(line): - issue("warning", "Version printing instruction — rely on git history", str(skill_path), i) - - # Check referenced files exist - ref_pat = re.compile(r'`(references/[^`]+)`') - skill_dir = skill_path.parent - for i, line in enumerate(lines, 1): - for match in ref_pat.finditer(line): - ref_path = skill_dir / match.group(1) - if not ref_path.exists(): - issue("error", f"Referenced file does not exist: {match.group(1)}", str(skill_path), i) - - # Check for non-existent command references - cmd_pat = re.compile(r'/([a-z][a-z0-9-]+)') - cmd_names = {c.stem for c in commands} - for i, line in 
enumerate(lines, 1): - for match in cmd_pat.finditer(line): - cmd_ref = match.group(1) - # Skip common false positives - if cmd_ref in ("dev", "null", "tmp", "etc", "usr", "bin", "opsx"): - continue - if cmd_ref.startswith("opsx:") or cmd_ref.startswith("ce:"): - continue - if cmd_ref not in cmd_names and cmd_ref not in skill_names: - # Only flag if it looks like a slash command (preceded by whitespace or start of line) - before = line[:match.start()].rstrip() - if before == "" or before.endswith((" ", "\t", '"', "'", ":")): - issue("info", f"References /{cmd_ref} — not found in commands/ or skills/", str(skill_path), i) - - # Check eval coverage - if evals_dir: - eval_files = find_evals(evals_dir, plugin_name) - eval_stems = set() - for ef in eval_files: - stem = ef.stem.replace(".eval", "") - eval_stems.add(stem) - - for skill_name in sorted(skill_names): - # Check various naming patterns - has_eval = ( - skill_name in eval_stems - or skill_name.replace(plugin_name + "-", "") in eval_stems - or any(skill_name in s for s in eval_stems) - ) - if not has_eval: - issue("warning", f"Skill '{skill_name}' has no corresponding eval file", str(plugin_dir / "skills" / skill_name / "SKILL.md")) - - # Check command path consistency - path_styles = set() - for cmd_path in commands: - text = cmd_path.read_text(encoding="utf-8") - if "plugins/" in text: - path_styles.add("absolute") - if re.search(r'skills/[a-z]', text) and "plugins/" not in text.split("skills/")[0][-20:]: - path_styles.add("relative") - if len(path_styles) > 1: - issue("info", "Commands use mixed path styles (some relative, some absolute)", str(plugin_dir / "commands")) - - return issues - - -def main(): - if len(sys.argv) < 2: - print(f"Usage: {sys.argv[0]} [--evals-dir ] [--json]", file=sys.stderr) - sys.exit(2) - - plugin_dir = Path(sys.argv[1]) - output_json = "--json" in sys.argv - - evals_dir = None - if "--evals-dir" in sys.argv: - idx = sys.argv.index("--evals-dir") - if idx + 1 < len(sys.argv): - 
evals_dir = Path(sys.argv[idx + 1]) - - if not plugin_dir.is_dir(): - print(f"Error: {plugin_dir} is not a directory", file=sys.stderr) - sys.exit(2) - - issues = lint_plugin(plugin_dir, evals_dir) - - if output_json: - print(json.dumps(issues, indent=2)) - else: - for iss in issues: - line = f":{iss['line']}" if iss.get("line") else "" - print(f"[{iss['severity'].upper()}] {iss['file']}{line}: {iss['message']}") - - counts = {} - for iss in issues: - counts[iss["severity"]] = counts.get(iss["severity"], 0) + 1 - if issues: - print(f"\n{len(issues)} issues: {', '.join(f'{v} {k}' for k, v in sorted(counts.items()))}") - else: - print("No issues found.") - - sys.exit(1 if any(i["severity"] == "error" for i in issues) else 0) - - -if __name__ == "__main__": - main() diff --git a/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/SKILL.md b/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/SKILL.md deleted file mode 100644 index 23e2c346..00000000 --- a/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/SKILL.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -name: agentv-eval-review -description: >- - Use when reviewing eval YAML files for quality issues, linting eval files before - committing, checking eval schema compliance, or when asked to "review these evals", - "check eval quality", "lint eval files", or "validate eval structure". - Do NOT use for writing evals (use agentv-eval-writer) or running evals (use agentv-bench). ---- - -# Eval Review - -## Overview - -Lint and review AgentV eval YAML files for structural issues, schema compliance, and quality problems. Runs deterministic checks via script, then applies LLM judgment for semantic issues the script cannot catch. 
- -## Process - -### Step 1: Run the linter - -Execute `scripts/lint_eval.py` against the target eval files: - -```bash -python scripts/lint_eval.py --json -``` - -The script checks: -- `.eval.yaml` extension -- `description` field present -- Each test has `id`, `input`, and at least one of `criteria`/`expected_output`/`assertions` -- File paths in `type: file` use leading `/` -- `assertions` blocks present (flags tests relying solely on `expected_output`) -- `expected_output` prose detection (flags "The agent should..." patterns) -- Repeated file inputs across tests (recommends top-level `input`) -- Naming prefix consistency across eval files in same directory - -### Step 2: Review script output - -Report the script findings grouped by severity (error > warning > info). For each finding, include the file path and a concrete fix. - -### Step 3: Semantic review (LLM judgment) - -The script catches structural issues but cannot assess: -- **Factual accuracy** — Do tool/command names in expected_output match what the skill documents? -- **Coverage gaps** — Are important edge cases missing? -- **Assertion discriminability** — Would assertions pass for both good and bad output? -- **Cross-file consistency** — Do output filenames match across evals and skills? - -Read the relevant SKILL.md files and cross-check against the eval content for these issues. - -## Skill Resources - -- `scripts/lint_eval.py` — Deterministic eval linter (Python 3.11+, stdlib only) diff --git a/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/scripts/lint_eval.py b/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/scripts/lint_eval.py deleted file mode 100644 index 1ba45088..00000000 --- a/evals/agentic-engineering/workspace-template/.agents/skills/agentv-eval-review/scripts/lint_eval.py +++ /dev/null @@ -1,239 +0,0 @@ -#!/usr/bin/env python3 -"""Lint AgentV eval YAML files for common issues. 
- -Usage: python lint_eval.py [--json] - -Checks: - - File uses .eval.yaml extension - - description field present - - Each test has id, input, criteria - - File paths in type:file use leading / - - assertions blocks present (not relying solely on expected_output) - - expected_output does not contain evaluation criteria prose - - Repeated file inputs across tests (should use top-level input) - - Naming prefix consistency across eval files in same directory - -Exit code: 0 if no issues, 1 if issues found. -""" - -import json -import os -import re -import sys -from pathlib import Path - -try: - import yaml -except ImportError: - # Fall back to basic YAML parsing if PyYAML not available - yaml = None - - -def parse_yaml_basic(text: str) -> dict: - """Minimal YAML-ish parser for when PyYAML is unavailable.""" - # This is a best-effort fallback; recommend installing PyYAML - import ast - # Try json first (YAML is a superset of JSON) - try: - return json.loads(text) - except Exception: - pass - return {} - - -def load_yaml(path: Path) -> dict: - text = path.read_text(encoding="utf-8") - if yaml: - return yaml.safe_load(text) or {} - return parse_yaml_basic(text) - - -def lint_file(path: Path) -> list[dict]: - issues = [] - - def issue(severity: str, msg: str, line: int | None = None): - issues.append({"file": str(path), "severity": severity, "message": msg, "line": line}) - - # Check extension - if not path.name.endswith(".eval.yaml"): - issue("error", f"File should use .eval.yaml extension, got: {path.name}") - - try: - data = load_yaml(path) - except Exception as e: - issue("error", f"Failed to parse YAML: {e}") - return issues - - if not isinstance(data, dict): - issue("error", "Root element is not a mapping") - return issues - - # Check description - if "description" not in data: - issue("warning", "Missing top-level 'description' field") - - tests = data.get("tests", []) - if not isinstance(tests, list): - issue("error", "'tests' is not a list") - return issues - - 
if not tests: - issue("warning", "No tests defined") - return issues - - # Check for top-level input (shared file references) - top_level_input = data.get("input") - - # Collect file values across tests to detect repetition - file_values_per_test: list[list[str]] = [] - - for i, test in enumerate(tests): - test_id = test.get("id", f"test-{i}") - - if "id" not in test: - issue("error", f"Test at index {i} missing 'id'") - - if "input" not in test and top_level_input is None: - issue("error", f"Test '{test_id}' missing 'input' and no top-level input defined") - - has_criteria = "criteria" in test - has_expected = "expected_output" in test - has_assertions = "assertions" in test - - if not has_criteria and not has_expected and not has_assertions: - issue("error", f"Test '{test_id}' needs at least one of: criteria, expected_output, assertions") - - # Check assertions present - if not has_assertions and has_expected: - issue("warning", f"Test '{test_id}' has expected_output but no assertions — add deterministic assertions where possible") - - # Check expected_output for prose patterns - if has_expected: - expected = test["expected_output"] - expected_text = "" - if isinstance(expected, str): - expected_text = expected - elif isinstance(expected, list): - for msg in expected: - if isinstance(msg, dict): - content = msg.get("content", "") - if isinstance(content, str): - expected_text += content - - prose_patterns = [ - r"[Tt]he agent should", - r"[Ss]hould identify", - r"[Ss]hould flag", - r"[Ss]hould recommend", - r"[Ss]hould produce", - r"[Ss]hould detect", - r"[Ss]hould load", - r"[Ss]hould run", - ] - for pat in prose_patterns: - if re.search(pat, expected_text): - issue("warning", f"Test '{test_id}' expected_output contains evaluation criteria prose ('{pat.lstrip('[Tt]').lstrip('[Ss]')}...') — use criteria or assertions instead") - break - - # Collect file values from input - test_files = extract_file_values(test.get("input", [])) - 
file_values_per_test.append(test_files) - - # Check file paths for leading / - for fv in test_files: - if not fv.startswith("/"): - issue("warning", f"Test '{test_id}' file path missing leading '/': {fv}") - - # Check for repeated file inputs - if len(file_values_per_test) >= 2 and not top_level_input: - common_files = set(file_values_per_test[0]) - for fvs in file_values_per_test[1:]: - common_files &= set(fvs) - if common_files: - issue("info", f"File input repeated in every test: {', '.join(sorted(common_files))} — consider using top-level input") - - return issues - - -def extract_file_values(input_data) -> list[str]: - """Extract type:file values from input structure.""" - files = [] - if isinstance(input_data, list): - for item in input_data: - if isinstance(item, dict): - content = item.get("content", []) - if isinstance(content, list): - for c in content: - if isinstance(c, dict) and c.get("type") == "file": - v = c.get("value", "") - if v: - files.append(v) - return files - - -def lint_directory(path: Path) -> list[dict]: - issues = [] - eval_files = sorted(path.rglob("*.yaml")) + sorted(path.rglob("*.yml")) - - if not eval_files: - issues.append({"file": str(path), "severity": "warning", "message": "No eval files found", "line": None}) - return issues - - # Check naming prefix consistency - prefixes = set() - for f in eval_files: - name = f.stem.replace(".eval", "") - parts = name.split("-") - if len(parts) >= 2: - prefixes.add(parts[0]) - - if len(prefixes) > 1: - issues.append({ - "file": str(path), - "severity": "info", - "message": f"Inconsistent naming prefixes: {', '.join(sorted(prefixes))}", - "line": None, - }) - - for f in eval_files: - issues.extend(lint_file(f)) - - return issues - - -def main(): - if len(sys.argv) < 2: - print(f"Usage: {sys.argv[0]} [--json]", file=sys.stderr) - sys.exit(2) - - target = Path(sys.argv[1]) - output_json = "--json" in sys.argv - - if target.is_file(): - issues = lint_file(target) - elif target.is_dir(): - issues 
= lint_directory(target) - else: - print(f"Error: {target} not found", file=sys.stderr) - sys.exit(2) - - if output_json: - print(json.dumps(issues, indent=2)) - else: - for iss in issues: - line = f":{iss['line']}" if iss.get("line") else "" - print(f"[{iss['severity'].upper()}] {iss['file']}{line}: {iss['message']}") - - counts = {} - for iss in issues: - counts[iss["severity"]] = counts.get(iss["severity"], 0) + 1 - if issues: - print(f"\n{len(issues)} issues: {', '.join(f'{v} {k}' for k, v in sorted(counts.items()))}") - else: - print("No issues found.") - - sys.exit(1 if any(i["severity"] == "error" for i in issues) else 0) - - -if __name__ == "__main__": - main() diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.sh b/evals/agentic-engineering/workspace-template/scripts/setup.sh new file mode 100644 index 00000000..e36ebf00 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/scripts/setup.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Workspace before_all hook: copy skills into .agents/skills/ for agent discovery. +# Runs from the workspace root at eval startup. + +set -e + +REPO_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || echo "$(cd "$(dirname "$0")/../../../.." 
&& pwd)")" + +mkdir -p .agents/skills + +# Copy agentic-engineering skills +cp -r "$REPO_ROOT/plugins/agentic-engineering/skills/agent-plugin-review" .agents/skills/ +cp -r "$REPO_ROOT/plugins/agentic-engineering/skills/agent-architecture-design" .agents/skills/ + +# Copy agentv-dev eval review skill +cp -r "$REPO_ROOT/plugins/agentv-dev/skills/agentv-eval-review" .agents/skills/ + +echo "Skills copied to .agents/skills/" +ls .agents/skills/ From 6a8b0fc8e92d2c0527ebc7f0db1f478241b05aa1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 13:49:18 +1100 Subject: [PATCH 05/14] refactor: replace bash setup with TypeScript for cross-platform Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agent-plugin-review.eval.yaml | 2 +- .../workspace-template/scripts/setup.sh | 19 ---------- .../workspace-template/scripts/setup.ts | 36 +++++++++++++++++++ 3 files changed, 37 insertions(+), 20 deletions(-) delete mode 100644 evals/agentic-engineering/workspace-template/scripts/setup.sh create mode 100644 evals/agentic-engineering/workspace-template/scripts/setup.ts diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 34f60423..c0ed9028 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -9,7 +9,7 @@ workspace: template: ./workspace-template hooks: before_all: - command: bash scripts/setup.sh + command: bun scripts/setup.ts tests: - id: detect-missing-eval diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.sh b/evals/agentic-engineering/workspace-template/scripts/setup.sh deleted file mode 100644 index e36ebf00..00000000 --- a/evals/agentic-engineering/workspace-template/scripts/setup.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# Workspace before_all hook: copy skills into .agents/skills/ for agent discovery. -# Runs from the workspace root at eval startup. 
- -set -e - -REPO_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || echo "$(cd "$(dirname "$0")/../../../.." && pwd)")" - -mkdir -p .agents/skills - -# Copy agentic-engineering skills -cp -r "$REPO_ROOT/plugins/agentic-engineering/skills/agent-plugin-review" .agents/skills/ -cp -r "$REPO_ROOT/plugins/agentic-engineering/skills/agent-architecture-design" .agents/skills/ - -# Copy agentv-dev eval review skill -cp -r "$REPO_ROOT/plugins/agentv-dev/skills/agentv-eval-review" .agents/skills/ - -echo "Skills copied to .agents/skills/" -ls .agents/skills/ diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.ts b/evals/agentic-engineering/workspace-template/scripts/setup.ts new file mode 100644 index 00000000..7498d1e5 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/scripts/setup.ts @@ -0,0 +1,36 @@ +#!/usr/bin/env bun +/** + * Workspace before_all hook: copy skills into .agents/skills/ for agent discovery. + * Runs from the workspace root at eval startup. 
+ */ + +import { cpSync, mkdirSync, readdirSync } from 'node:fs'; +import { resolve, join } from 'node:path'; +import { execSync } from 'node:child_process'; + +// Resolve repo root (works whether run from workspace or repo root) +let repoRoot: string; +try { + repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8' }).trim(); +} catch { + repoRoot = resolve(import.meta.dirname, '..', '..', '..', '..'); +} + +const targetDir = join(process.cwd(), '.agents', 'skills'); +mkdirSync(targetDir, { recursive: true }); + +const skillSources = [ + join(repoRoot, 'plugins', 'agentic-engineering', 'skills', 'agent-plugin-review'), + join(repoRoot, 'plugins', 'agentic-engineering', 'skills', 'agent-architecture-design'), + join(repoRoot, 'plugins', 'agentv-dev', 'skills', 'agentv-eval-review'), +]; + +for (const src of skillSources) { + const name = src.split(/[\\/]/).pop()!; + const dest = join(targetDir, name); + cpSync(src, dest, { recursive: true }); + console.log(`Copied ${name}`); +} + +console.log(`\nSkills in ${targetDir}:`); +console.log(readdirSync(targetDir).join(', ')); From 09342fd386331b3e7094d57b09b50a684fe0c771 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 13:50:18 +1100 Subject: [PATCH 06/14] refactor: use .mjs with node instead of .ts with bun Node.js is more universally available than bun. Script only uses stdlib modules (fs, path, child_process, url). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agent-plugin-review.eval.yaml | 2 +- .../scripts/{setup.ts => setup.mjs} | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) rename evals/agentic-engineering/workspace-template/scripts/{setup.ts => setup.mjs} (76%) diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index c0ed9028..0f539b6d 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -9,7 +9,7 @@ workspace: template: ./workspace-template hooks: before_all: - command: bun scripts/setup.ts + command: node scripts/setup.mjs tests: - id: detect-missing-eval diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.ts b/evals/agentic-engineering/workspace-template/scripts/setup.mjs similarity index 76% rename from evals/agentic-engineering/workspace-template/scripts/setup.ts rename to evals/agentic-engineering/workspace-template/scripts/setup.mjs index 7498d1e5..8bb921d9 100644 --- a/evals/agentic-engineering/workspace-template/scripts/setup.ts +++ b/evals/agentic-engineering/workspace-template/scripts/setup.mjs @@ -1,19 +1,22 @@ -#!/usr/bin/env bun +#!/usr/bin/env node /** * Workspace before_all hook: copy skills into .agents/skills/ for agent discovery. * Runs from the workspace root at eval startup. 
*/ import { cpSync, mkdirSync, readdirSync } from 'node:fs'; -import { resolve, join } from 'node:path'; +import { resolve, join, dirname } from 'node:path'; import { execSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; -// Resolve repo root (works whether run from workspace or repo root) -let repoRoot: string; +const __dirname = dirname(fileURLToPath(import.meta.url)); + +// Resolve repo root +let repoRoot; try { repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8' }).trim(); } catch { - repoRoot = resolve(import.meta.dirname, '..', '..', '..', '..'); + repoRoot = resolve(__dirname, '..', '..', '..', '..'); } const targetDir = join(process.cwd(), '.agents', 'skills'); @@ -26,7 +29,7 @@ const skillSources = [ ]; for (const src of skillSources) { - const name = src.split(/[\\/]/).pop()!; + const name = src.split(/[\\/]/).pop(); const dest = join(targetDir, name); cpSync(src, dest, { recursive: true }); console.log(`Copied ${name}`); From 2fc80de0751da753837138786c59a62c10acbf48 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 13:52:01 +1100 Subject: [PATCH 07/14] feat: restore skill-trigger assertions now that before_all copies skills Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agent-plugin-review.eval.yaml | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 0f539b6d..88e20116 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -1,4 +1,4 @@ -description: Evaluates that an agent can catch planted issues in a mock plugin +description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin execution: targets: @@ -18,6 +18,8 @@ tests: Review the deploy-auto plugin in this repo for completeness. 
Check that every skill has a corresponding eval file. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: contains value: deploy-rollback - type: rubrics @@ -30,6 +32,8 @@ tests: input: | Review the eval files under evals/deploy-auto/ for naming convention issues. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: contains value: .eval.yaml - type: rubrics @@ -44,6 +48,8 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for eval quality issues. Check assertion coverage and expected_output format. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: rubrics criteria: - Flags that no assertions are defined in deploy-plan.yaml @@ -55,6 +61,8 @@ tests: input: | Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: rubrics criteria: - Flags that file paths are missing a leading slash @@ -66,6 +74,8 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for structural improvements. Look at how inputs are organized across test cases. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: rubrics criteria: - Identifies the repeated SKILL.md file input across all 3 tests @@ -77,6 +87,8 @@ tests: Review the deploy-auto plugin's workflow architecture. Check whether phases enforce prerequisites before proceeding. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: rubrics criteria: - Flags that deploy-execute does not check for deploy-plan.md before starting @@ -89,6 +101,8 @@ tests: Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy. Cross-check expected outputs against what the skills actually document. 
assertions: + - type: skill-trigger + skill: agent-plugin-review - type: rubrics criteria: - Flags the contradiction between pytest (skill) and python -m unittest (eval) @@ -100,6 +114,8 @@ tests: Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues. Check that referenced commands and skills actually exist. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: rubrics criteria: - Flags that /deploy-execute is referenced but does not exist as a slash command @@ -111,6 +127,8 @@ tests: input: | Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: rubrics criteria: - Flags the hardcoded path C:\Users\admin\.kube\config From 8a7c25aef6a005c6736336538c3f4bc947d78163 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 14:16:11 +1100 Subject: [PATCH 08/14] fix: copy skills to .codex/skills/ for pi discovery Pi discovers workspace skills from .codex/skills/, not .agents/skills/. Copy to both directories so any provider can find them. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workspace-template/scripts/setup.mjs | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.mjs b/evals/agentic-engineering/workspace-template/scripts/setup.mjs index 8bb921d9..d45e03e6 100644 --- a/evals/agentic-engineering/workspace-template/scripts/setup.mjs +++ b/evals/agentic-engineering/workspace-template/scripts/setup.mjs @@ -19,8 +19,14 @@ try { repoRoot = resolve(__dirname, '..', '..', '..', '..'); } -const targetDir = join(process.cwd(), '.agents', 'skills'); -mkdirSync(targetDir, { recursive: true }); +// Copy to all skill discovery directories so any provider can find them +const skillDirs = [ + join(process.cwd(), '.agents', 'skills'), + join(process.cwd(), '.codex', 'skills'), +]; +for (const dir of skillDirs) { + mkdirSync(dir, { recursive: true }); +} const skillSources = [ join(repoRoot, 'plugins', 'agentic-engineering', 'skills', 'agent-plugin-review'), @@ -30,10 +36,13 @@ const skillSources = [ for (const src of skillSources) { const name = src.split(/[\\/]/).pop(); - const dest = join(targetDir, name); - cpSync(src, dest, { recursive: true }); + for (const dir of skillDirs) { + cpSync(src, join(dir, name), { recursive: true }); + } console.log(`Copied ${name}`); } -console.log(`\nSkills in ${targetDir}:`); -console.log(readdirSync(targetDir).join(', ')); +for (const dir of skillDirs) { + console.log(`\nSkills in ${dir}:`); + console.log(readdirSync(dir).join(', ')); +} From cebccb315c5be9c963828e2e0ffc958ecc84db65 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 14:22:23 +1100 Subject: [PATCH 09/14] fix: use .pi/skills/ for pi discovery per docs Pi discovers workspace skills from .agents/skills/ and .pi/skills/ per pi-mono docs. Replace .codex/skills/ with .pi/skills/. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- evals/agentic-engineering/workspace-template/scripts/setup.mjs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.mjs b/evals/agentic-engineering/workspace-template/scripts/setup.mjs index d45e03e6..27d24e4b 100644 --- a/evals/agentic-engineering/workspace-template/scripts/setup.mjs +++ b/evals/agentic-engineering/workspace-template/scripts/setup.mjs @@ -20,9 +20,10 @@ try { } // Copy to all skill discovery directories so any provider can find them +// Pi looks in .agents/skills/ and .pi/skills/ per docs const skillDirs = [ join(process.cwd(), '.agents', 'skills'), - join(process.cwd(), '.codex', 'skills'), + join(process.cwd(), '.pi', 'skills'), ]; for (const dir of skillDirs) { mkdirSync(dir, { recursive: true }); From 4023bed0a7983641d2925ed5610be6832cb14624 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 14:27:42 +1100 Subject: [PATCH 10/14] fix: remove skill-trigger assertions pending pi workspace skill discovery Pi-cli does not discover skills from workspace-local .agents/skills/ or .pi/skills/ directories. Content assertions still validate review quality. Skill-trigger to be re-added once pi workspace discovery works. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agent-plugin-review.eval.yaml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 88e20116..ebab144a 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -18,8 +18,6 @@ tests: Review the deploy-auto plugin in this repo for completeness. Check that every skill has a corresponding eval file. 
assertions: - - type: skill-trigger - skill: agent-plugin-review - type: contains value: deploy-rollback - type: rubrics @@ -32,8 +30,6 @@ tests: input: | Review the eval files under evals/deploy-auto/ for naming convention issues. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: contains value: .eval.yaml - type: rubrics @@ -48,8 +44,6 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for eval quality issues. Check assertion coverage and expected_output format. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: rubrics criteria: - Flags that no assertions are defined in deploy-plan.yaml @@ -61,8 +55,6 @@ tests: input: | Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: rubrics criteria: - Flags that file paths are missing a leading slash @@ -74,8 +66,6 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for structural improvements. Look at how inputs are organized across test cases. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: rubrics criteria: - Identifies the repeated SKILL.md file input across all 3 tests @@ -87,8 +77,6 @@ tests: Review the deploy-auto plugin's workflow architecture. Check whether phases enforce prerequisites before proceeding. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: rubrics criteria: - Flags that deploy-execute does not check for deploy-plan.md before starting @@ -101,8 +89,6 @@ tests: Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy. Cross-check expected outputs against what the skills actually document. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: rubrics criteria: - Flags the contradiction between pytest (skill) and python -m unittest (eval) @@ -114,8 +100,6 @@ tests: Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues. 
Check that referenced commands and skills actually exist. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: rubrics criteria: - Flags that /deploy-execute is referenced but does not exist as a slash command @@ -127,8 +111,6 @@ tests: input: | Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues. assertions: - - type: skill-trigger - skill: agent-plugin-review - type: rubrics criteria: - Flags the hardcoded path C:\Users\admin\.kube\config From 0a1eafff94e91e369b0425552cde486ae04df67e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 15:07:33 +1100 Subject: [PATCH 11/14] fix: use array format for before_all command The workspace hook parser requires command as an array, not a string. String values are silently ignored, causing the hook to not run. Co-Authored-By: Claude Opus 4.6 (1M context) --- evals/agentic-engineering/agent-plugin-review.eval.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index ebab144a..530641cf 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -9,7 +9,9 @@ workspace: template: ./workspace-template hooks: before_all: - command: node scripts/setup.mjs + command: + - node + - scripts/setup.mjs tests: - id: detect-missing-eval From f16444b0f666366881170c3b859acacdca974acb Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 15:09:07 +1100 Subject: [PATCH 12/14] fix: read workspace_path from stdin in setup hook The before_all hook runs with cwd=evalDir, not the workspace. The orchestrator passes workspace_path via stdin JSON. Updated setup.mjs to read it from stdin and copy skills there. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workspace-template/scripts/setup.mjs | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.mjs b/evals/agentic-engineering/workspace-template/scripts/setup.mjs index 27d24e4b..5d8e23d4 100644 --- a/evals/agentic-engineering/workspace-template/scripts/setup.mjs +++ b/evals/agentic-engineering/workspace-template/scripts/setup.mjs @@ -1,29 +1,43 @@ #!/usr/bin/env node /** - * Workspace before_all hook: copy skills into .agents/skills/ for agent discovery. - * Runs from the workspace root at eval startup. + * Workspace before_all hook: copy skills into the workspace for agent discovery. + * Receives workspace_path via stdin JSON from the AgentV orchestrator. */ -import { cpSync, mkdirSync, readdirSync } from 'node:fs'; +import { cpSync, mkdirSync, readdirSync, readFileSync } from 'node:fs'; import { resolve, join, dirname } from 'node:path'; import { execSync } from 'node:child_process'; import { fileURLToPath } from 'node:url'; const __dirname = dirname(fileURLToPath(import.meta.url)); +// Read workspace_path from stdin (provided by AgentV orchestrator) +let workspacePath; +try { + const stdin = readFileSync(0, 'utf8'); + const context = JSON.parse(stdin); + workspacePath = context.workspace_path; +} catch { + // Fallback to cwd if stdin is not available + workspacePath = process.cwd(); +} + +console.log(`Workspace path: ${workspacePath}`); + // Resolve repo root let repoRoot; try { - repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8' }).trim(); + repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8', cwd: __dirname }).trim(); } catch { repoRoot = resolve(__dirname, '..', '..', '..', '..'); } -// Copy to all skill discovery directories so any provider can find them -// Pi looks in .agents/skills/ and .pi/skills/ per docs +console.log(`Repo root: ${repoRoot}`); + +// Copy to skill 
discovery directories in the workspace const skillDirs = [ - join(process.cwd(), '.agents', 'skills'), - join(process.cwd(), '.pi', 'skills'), + join(workspacePath, '.agents', 'skills'), + join(workspacePath, '.pi', 'skills'), ]; for (const dir of skillDirs) { mkdirSync(dir, { recursive: true }); From 82cf17ec57596d69082fed28de53db4dba9dd406 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 15:15:57 +1100 Subject: [PATCH 13/14] fix: use {{workspace_path}} to resolve setup script path Hook commands run from evalDir, not the workspace. Use the interpolation variable to resolve the script from the workspace. Co-Authored-By: Claude Opus 4.6 (1M context) --- evals/agentic-engineering/agent-plugin-review.eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 530641cf..5e8b417d 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -11,7 +11,7 @@ workspace: before_all: command: - node - - scripts/setup.mjs + - "{{workspace_path}}/scripts/setup.mjs" tests: - id: detect-missing-eval From 77d049b070dc3592afcbab12c4773b3e35080789 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 26 Mar 2026 15:17:10 +1100 Subject: [PATCH 14/14] fix: resolve repo root from cwd (eval dir is inside repo) The hook runs with cwd=evalDir which is inside the git repo. Use process.cwd() for git rev-parse instead of __dirname (which is inside the materialized workspace copy, not the repo). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workspace-template/scripts/setup.mjs | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.mjs b/evals/agentic-engineering/workspace-template/scripts/setup.mjs index 5d8e23d4..bfeddcf6 100644 --- a/evals/agentic-engineering/workspace-template/scripts/setup.mjs +++ b/evals/agentic-engineering/workspace-template/scripts/setup.mjs @@ -2,14 +2,12 @@ /** * Workspace before_all hook: copy skills into the workspace for agent discovery. * Receives workspace_path via stdin JSON from the AgentV orchestrator. + * Runs with cwd = eval file directory (which is inside the repo). */ import { cpSync, mkdirSync, readdirSync, readFileSync } from 'node:fs'; -import { resolve, join, dirname } from 'node:path'; +import { join } from 'node:path'; import { execSync } from 'node:child_process'; -import { fileURLToPath } from 'node:url'; - -const __dirname = dirname(fileURLToPath(import.meta.url)); // Read workspace_path from stdin (provided by AgentV orchestrator) let workspacePath; @@ -18,20 +16,19 @@ try { const context = JSON.parse(stdin); workspacePath = context.workspace_path; } catch { - // Fallback to cwd if stdin is not available workspacePath = process.cwd(); } -console.log(`Workspace path: ${workspacePath}`); - -// Resolve repo root +// Resolve repo root from cwd (eval dir is inside the repo) let repoRoot; try { - repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8', cwd: __dirname }).trim(); + repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8' }).trim(); } catch { - repoRoot = resolve(__dirname, '..', '..', '..', '..'); + console.error('Failed to resolve repo root from cwd:', process.cwd()); + process.exit(1); } +console.log(`Workspace: ${workspacePath}`); console.log(`Repo root: ${repoRoot}`); // Copy to skill discovery directories in the workspace @@ -58,6 +55,5 @@ for (const src of 
skillSources) { } for (const dir of skillDirs) { - console.log(`\nSkills in ${dir}:`); - console.log(readdirSync(dir).join(', ')); + console.log(`Skills in ${dir}: ${readdirSync(dir).join(', ')}`); }