diff --git a/README.md b/README.md index 676f77ee..bb644d1d 100644 --- a/README.md +++ b/README.md @@ -119,8 +119,8 @@ In Codex, Cursor, and Antigravity each workflow is a skill invoked by slash comm | Component | What it does | | --- | --- | -| **`/piyaz:composer` skill** | End-to-end task orchestrator. Picks the highest-value ready task (or one named ref), drives it through research → plan → implement → propagate via three dispatched subagents per task in clean per-phase contexts, loops until queue empty or user stops. Requires `/goal` harness for backlog mode (composer emits it on first turn; user pastes). | -| **Composer subagents** | `piyaz:composer-researcher` gathers grounded context and refines the task; `piyaz:composer-planner` writes the unabridged implementation plan; `piyaz:composer-implementer` ships the code, opens a PR, and marks the task done. | +| **`/piyaz:composer` skill** | End-to-end task orchestrator. Picks the highest-value ready task (or one named ref), drives it through research → plan → implement → review → propagate via a per-task workflow that dispatches phase subagents in clean per-phase contexts, merges the PR and continues when the user authorizes it, and loops until queue empty or user stops. Requires `/goal` harness for backlog mode (composer emits it on first turn; user pastes). | +| **Composer subagents** | `piyaz:composer-researcher` gathers grounded context and refines the task; `piyaz:composer-planner` writes the unabridged implementation plan; `piyaz:composer-implementer` ships the code, opens a PR, and marks the task `in_review`; `piyaz:review` returns the verdict that drives the bounded fix loop. | | **`piyaz:decompose-task` agent** | Splits an existing oversize task in an active project into 2 to N children, rewires every dependency edge touching the parent, cancels the parent with rationale citing the children. Composer's oversize handler routes here. | | **`piyaz:decompose-feature` agent** | Adds a new feature or capability cluster to an active project. Reuses existing categories and tag vocabulary; creates 5 to 20 tasks plus internal and integration edges. | @@ -169,7 +169,7 @@ Piyaz ships as a Next.js web app plus vendor-native plugins for Claude Code, Cod ❯ Priority is urgent, draft ACs are enough, and monorepo detection should ask the user. ``` -**Drive end-to-end (Claude Code).** Once a project is active and tasks are ready, composer can take over. Pick the next task off the critical path, research it in context, plan it, implement it, open the PR, propagate the result, and loop: +**Drive end-to-end (Claude Code).** Once a project is active and tasks are ready, composer can take over. Pick the next task off the critical path, research it in context, plan it, implement it, open the PR, review and fix until it is ready, propagate the result (and merge when you authorize it), and loop: ```text ❯ /piyaz:composer @@ -181,7 +181,7 @@ Or take one specific task all the way to a PR: ❯ /piyaz:composer PYZ-101 ``` -Composer dispatches three subagents per task in clean per-phase contexts (researcher → planner → implementer). The orchestrator stays out of the work itself and only picks tasks, hands off, and propagates. +Composer runs a per-task workflow that dispatches phase subagents in clean per-phase contexts (researcher → planner → implementer → review), with a bounded fix loop until the PR is ready. The orchestrator stays out of the work itself: it picks tasks, resolves gates, merges when authorized, and propagates. **Tune in the UI.** Inspect edges, read execution records, and edit descriptions, ACs, tags, or dependencies directly. The agent loop and the UI write to the same store, so edits land by the next tool call. diff --git a/biome.jsonc b/biome.jsonc index a025a80b..d1744e56 100644 --- a/biome.jsonc +++ b/biome.jsonc @@ -23,7 +23,8 @@ "!cloudflare-env.d.ts", "!bun.lock", "!migrations/**", - "!drizzle/**" + "!drizzle/**", + "!plugins/**/workflows/**" ] }, "formatter": { diff --git a/lib/context/format.ts b/lib/context/format.ts index 96d4a98a..35f1fc03 100644 --- a/lib/context/format.ts +++ b/lib/context/format.ts @@ -88,6 +88,10 @@ export function formatDecisions(decisions: Decision[]): string { * - all checked: "All criteria met:" label followed by the checked list * - mixed: "Remaining:" section first (primacy for pending work), then "Done:" * + * Each line carries the criterion's backticked id so agents can target the + * documented by-id rewrite (`acceptanceCriteria=[{id, text}]`) without + * appending duplicates. + * * @param criteria - Array of acceptance criteria. * @returns Formatted checklist string, possibly grouped by checked state. */ @@ -97,8 +101,9 @@ export function formatCriteria(criteria: AcceptanceCriterion[]): string { const remaining = criteria.filter((c) => !c.checked); const done = criteria.filter((c) => c.checked); const renderRemaining = () => - remaining.map((c) => `- [ ] ${c.text}`).join("\n"); - const renderDone = () => done.map((c) => `- [x] ${c.text}`).join("\n"); + remaining.map((c) => `- [ ] \`${c.id}\` ${c.text}`).join("\n"); + const renderDone = () => + done.map((c) => `- [x] \`${c.id}\` ${c.text}`).join("\n"); if (done.length === 0) return renderRemaining(); if (remaining.length === 0) return `All criteria met:\n${renderDone()}`; diff --git a/lib/mcp/create-server.ts b/lib/mcp/create-server.ts index fed77a7f..fc6ac4ee 100644 --- a/lib/mcp/create-server.ts +++ b/lib/mcp/create-server.ts @@ -686,7 +686,7 @@ export function createMcpServer(ctx: AuthContext): McpServer { { name: "piyaz", title: "Piyaz", - version: "0.1.0", + version: "0.1.1", websiteUrl: "https://www.piyaz.ai", icons: [ { diff --git a/plugins/antigravity/plugin.json b/plugins/antigravity/plugin.json index 7d1327e7..84c475d3 100644 --- a/plugins/antigravity/plugin.json +++ b/plugins/antigravity/plugin.json @@ -1,5 +1,5 @@ { "name": "piyaz", - "version": "0.1.0", + "version": "0.1.1", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions." } diff --git a/plugins/antigravity/skills/composer/references/reviewer-rules.md b/plugins/antigravity/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..f4ba311d --- /dev/null +++ b/plugins/antigravity/skills/composer/references/reviewer-rules.md @@ -0,0 +1,144 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical piyaz references for the review agent. +Mirrors: `skills/piyaz/references/conventions.md` §1, +`skills/piyaz/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and +`skills/piyaz/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. §2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## lifecycle §2.2 — Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Piyaz-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Piyaz-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Piyaz PR-status tracking. Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. + +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Piyaz refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + +--- + +## lifecycle §3 — Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Piyaz's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `piyaz_query type='edges'` on the changed task. Current relationships. +2. `piyaz_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. + +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". diff --git a/plugins/antigravity/skills/piyaz/SKILL.md b/plugins/antigravity/skills/piyaz/SKILL.md index 6b74ab66..2370428d 100644 --- a/plugins/antigravity/skills/piyaz/SKILL.md +++ b/plugins/antigravity/skills/piyaz/SKILL.md @@ -150,7 +150,7 @@ You handle most Piyaz interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`piyaz:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose HGT-17", composer's oversize handler) | Dispatch **`piyaz:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`piyaz:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode) or **`/piyaz:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode), **`/piyaz:composer `** (single-task mode), or **`/piyaz:composer rework `** (round GitHub review feedback back through the fix loop). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review LNS-12", "review this PR", "review ``", "what does the review subagent think of LNS-12") | Dispatch **`piyaz:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `piyaz:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`piyaz:manage`** for deep CTO mode | @@ -189,7 +189,7 @@ Lead with slim tools. - `piyaz_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. When HOTL requests changes on a composer PR instead of merging, `/piyaz:composer rework ` rounds that feedback back through the fix loop. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/antigravity/skills/piyaz/references/artifacts.md b/plugins/antigravity/skills/piyaz/references/artifacts.md index 3990a5f7..8c6b87fc 100644 --- a/plugins/antigravity/skills/piyaz/references/artifacts.md +++ b/plugins/antigravity/skills/piyaz/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Piyaz: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Task artifact quality: title, description, acceptanceCriteria, executionRecord, decisions, files @@ -140,7 +142,7 @@ BAD: Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -186,7 +188,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='meta'` before coining new ones. | Dimension | Count | Vocabulary | |---|---|---| @@ -409,29 +411,6 @@ The text you write into Piyaz is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/antigravity/skills/piyaz/references/conventions.md b/plugins/antigravity/skills/piyaz/references/conventions.md index 0880c797..592e1a5f 100644 --- a/plugins/antigravity/skills/piyaz/references/conventions.md +++ b/plugins/antigravity/skills/piyaz/references/conventions.md @@ -6,6 +6,8 @@ Piyaz runs across every kind of software and data project: web and SaaS apps, mo Every Piyaz skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + --- ## How this is split diff --git a/plugins/antigravity/skills/piyaz/references/lifecycle.md b/plugins/antigravity/skills/piyaz/references/lifecycle.md index 159b7ada..bf5139ad 100644 --- a/plugins/antigravity/skills/piyaz/references/lifecycle.md +++ b/plugins/antigravity/skills/piyaz/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Status lifecycle: what each state means, requires, and forbids diff --git a/plugins/antigravity/skills/review/SKILL.md b/plugins/antigravity/skills/review/SKILL.md index b1751841..6c581abc 100644 --- a/plugins/antigravity/skills/review/SKILL.md +++ b/plugins/antigravity/skills/review/SKILL.md @@ -32,24 +32,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical piyaz references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/piyaz/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/piyaz/references/conventions.md`. Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/piyaz/references/lifecycle.md`. Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/piyaz/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/piyaz/references/conventions.md -@skills/piyaz/references/lifecycle.md -@skills/piyaz/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -62,13 +49,14 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The piyaz skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. +If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools @@ -101,9 +89,9 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `piyaz_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord` and the `implementationPlan` body; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose description or ACs describe code changes, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. The diff is the source of truth for what changed; recorded file lists are not rendered in any bundle, so do not hunt for one. @@ -181,7 +169,7 @@ The plan named the files the implementer was going to touch. The PR diff names w ### 7. Downstream impact -`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -291,12 +279,67 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: ` + +- `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `piyaz_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: `; the orchestrator surfaces these to the user without consuming the failure budget. + +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: `. `reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) { + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='' -F repo='' -F pr= + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per 's review thread`). + - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `piyaz_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. If the description or ACs are weak, surface that as a process note in the verdict and route the user to `piyaz:manage` or the piyaz skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -321,16 +364,16 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/piyaz/references/conventions.md` at session start, and re-read mid-session when uncertain. +- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. -- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. -- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. +- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. +- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. - ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `piyaz_task`, `piyaz_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. diff --git a/plugins/claude-code/.claude-plugin/plugin.json b/plugins/claude-code/.claude-plugin/plugin.json index d8cc5b3d..a03f1590 100644 --- a/plugins/claude-code/.claude-plugin/plugin.json +++ b/plugins/claude-code/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "piyaz", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions.", - "version": "0.1.0", + "version": "0.1.1", "author": { "name": "Piyaz" }, diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index a2b7f866..ff63cece 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -16,6 +16,7 @@ description: > user asks "implement per the saved plan" outside the composer loop. model: opus +isolation: worktree --- # Composer implementer (Phase 3) @@ -23,22 +24,23 @@ model: opus You are the Phase 3 subagent of `/piyaz:composer`. The orchestrator dispatches you once per task, in a fresh context, with input shaped like: ``` -Target task: +Target task: (taskId ) in project Plan is saved to Piyaz. Fetch via piyaz_context depth='agent'. Optional: prior failed attempt's failure summary. +Optional (fix mode): "Fix mode. PR: ." plus the reviewer's blocking findings verbatim. ``` +The Piyaz MCP is stateless: pass the dispatched `projectId` on every Piyaz tool call. + Your job is to **ship the task end-to-end**: implement the plan, run the project's verification commands until green, open a PR, and mark the task `in_review` with a complete Completion Protocol payload. You are the only phase that writes code and the only phase that marks the task `in_review`. The HOTL operator finalizes `in_review → done` outside the composer loop. You operate in dispatched mode: the orchestrator (and behind it, the user) has already approved the plan. Do not ask the user mid-implementation; do not pause for a HOTL gate. If the plan is broken or unimplementable as written, surface it as a single concrete failure summary back to the orchestrator and stop. Do not guess. -## Piyaz operating context +## Operating rules -The canonical piyaz rules load with this agent. Citations later (`conventions §1`, `lifecycle §2`, etc.) point into this loaded content. Sections especially relevant to your phase: conventions §1 (Iron Law: `executionRecord` and `decisions` cite real code or are omitted), §2 (`_hints` discipline: read every `piyaz_task` response's `_hints` array and act on it); lifecycle §1 (required fields per status; `done` requires `executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`), §2 (Completion Protocol, PR template detection, bracket form, `gh pr create`), §3 (propagation, informational here; the orchestrator runs it after you return); artifacts §1 (executionRecord shape), §6 (markdown tone: no em dashes, no AI slop, no "I have implemented…" preambles). +Your phase rules load with this agent as a slim extract of the canonical piyaz references. Citations in this file (`conventions §1`, `lifecycle §2`, etc.) resolve inside the extract; the canonical files live at `skills/piyaz/references/` if you need a section the extract omits. -@skills/piyaz/references/conventions.md -@skills/piyaz/references/lifecycle.md -@skills/piyaz/references/artifacts.md +@skills/composer/references/implementer-rules.md ## Iron Law of grounding @@ -57,7 +59,7 @@ conventions §1 applies to your `executionRecord`, your `decisions`, and your `a ## Forbidden tools -`piyaz_task action='delete'` or `'create'`, `piyaz_edge` (any action), `piyaz_project` (any action), `git push --force`, `git reset --hard` on shared branches, `gh pr merge`, anything that closes or merges a PR. You ship the work and hand off; you do not self-merge. +`piyaz_task action='delete'` or `'create'`, `piyaz_edge` (any action), `piyaz_project` (any action), `git push --force`, `git reset --hard` on shared branches, `gh pr merge`, anything that closes or merges a PR. You ship the work and hand off; you do not self-merge. Resolving PR review threads (the GraphQL `resolveReviewThread` mutation, or any UI-equivalent) is also forbidden; the human resolves their own threads. `piyaz_task` with `overwriteArrays=true` is forbidden. Append to `decisions`, `files`, `acceptanceCriteria`; never replace them. @@ -65,7 +67,7 @@ conventions §1 applies to your `executionRecord`, your `decisions`, and your `a You own two transitions: `planned → in_progress` (your claim, before you touch code) and `in_progress → in_review` (the Completion Protocol payload, after the PR opens). The legal status values you may pass to `piyaz_task` are exactly these two: -- `status='in_progress'`: legal **only when entry status was `planned`** (or `in_progress` from a prior retry attempt). Send it as a single-field update before any code edits; this is your claim. +- `status='in_progress'`: legal when entry status was `planned` (or `in_progress` from a prior retry attempt), **or when entry status is `in_review` and your dispatch says fix mode** — that rotation re-opens your own completed hand-off to address review findings, never someone else's. Send it as a single-field update before any code edits; this is your claim. When entry status is already `in_progress` and the dispatch says rework, the claim write is a no-op — skip it. - `status='in_review'`: legal **only when entry status was `in_progress`** (your own claim). Send it together with the full Completion Protocol payload (`executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`). The HOTL operator finalizes `in_review → done` after PR approval; agents never self-promote. - `status='done'`: forbidden. Only the HOTL operator writes `done`; never composer, never an implementer. - `status='planned'`: forbidden. You never demote a task; the planner owns `planned`. @@ -80,15 +82,17 @@ On failure (verification cannot reach green, plan is broken), leave the task at a. `piyaz_context depth='agent' taskId=''`. Read multi-hop dependencies, upstream `executionRecord` entries, the full `implementationPlan`, and the current `acceptanceCriteria`. Read the plan in full; do not skim. -b. Confirm `status` is `planned`. If it is anything else (`in_progress` from a prior attempt is acceptable; `done` or `cancelled` means stop and report the unexpected state), surface it to the orchestrator and exit. +b. Confirm `status` is `planned`. If it is anything else (`in_progress` from a prior attempt is acceptable; `done` or `cancelled` means stop and report the unexpected state), surface it to the orchestrator and exit. Additionally verify every `depends_on` dependency in the agent-depth bundle is `done`. Any dependency not at `done` means the pick was premature (a plannable pick routed too far): exit without claiming, returning `STATUS: BLOCKED — dependencies unfinished: `. Claim semantics for `in_progress` entries: a foreign assignee (the bundle's `assignees` is non-empty and is not you) means someone else's claim — exit with `STATUS: BLOCKED — claimed by ` and touch nothing. No assignee at all is acceptable **only** with prior-attempt evidence: the deterministic task branch exists or an open PR carries the `[]` bracket; without evidence, exit `STATUS: BLOCKED — unowned in_progress claim, no prior-attempt evidence`. c. Verify the plan is implementable. Walk the plan's *Files to modify* list and confirm each path exists where the plan claims (or that the path is a new file the plan expects you to create). If a path is wrong, fail loudly: report the discrepancy, leave the task at `planned`, exit. d. Confirm the project's test, typecheck, and lint commands from the plan's *Verification* section. If the plan is missing one, read `package.json` / `pyproject.toml` / `Cargo.toml` to derive it; if you cannot derive it, fail loudly and exit. Do not invent commands. +e. When you are running directly in the orchestrator's tree (no worktree isolation), require a clean tree: `git status --porcelain` must print nothing. Anything else: fail loudly naming the leftover state (`STATUS: BLOCKED — dirty tree: `). Inside an isolated worktree this is guaranteed fresh; skip the check. + ### 2. Claim and branch -a. `piyaz_task action='update' taskId='' status='in_progress'`. This is your claim; it tells anyone else looking at the project the task is being worked. +a. `piyaz_task action='update' taskId='' status='in_progress'`. This is your claim; it tells anyone else looking at the project the task is being worked. When your dispatch carries a `Caller user id: ` line (a future server release may expose it), include `assigneeIds=['']` in this claim write so the claim names its owner. Today no MCP surface returns the caller's own user id (`piyaz_project action='teams'` lists team ids only), so claims rest on branch evidence: the deterministic branch name plus the `[]` bracket are the ownership proof. Say so in your return when you claim without an assignee, so the orchestrator can note it in the run log. b. Create a feature branch from the project's default branch. @@ -104,10 +108,27 @@ b. Create a feature branch from the project's default branch. - Task `[MYM-83] Extract validation helper`, tag `refactor` → `refactor/mym-83-extract-validation-helper` ```bash - git checkout main && git pull --ff-only - git checkout -b + DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef -q '.defaultBranchRef.name') + # Fallback when gh is unavailable: + # DEFAULT_BRANCH=$(git remote show origin | sed -n 's/.*HEAD branch: //p') + git fetch origin "$DEFAULT_BRANCH" + git fetch origin "+refs/heads/:refs/remotes/origin/" 2>/dev/null || true ``` + Never hardcode `main`; projects differ. Never check out the default branch itself: under worktree isolation it is usually checked out in the orchestrator's tree and `git checkout` refuses (one checkout per branch across worktrees); branching from `origin/$DEFAULT_BRANCH` gives the same fresh base in both modes. Shell state does not persist between your Bash tool calls: every later block that uses `$DEFAULT_BRANCH` re-derives it on its first line — keep those lines when you run the blocks separately. + + **If the task branch already exists** (locally or on `origin`): do not create a new one. Verify it is yours first against the remote ref (the branch may exist only on `origin`; the bare local name will not resolve there): + + ```bash + DEFAULT_BRANCH=${DEFAULT_BRANCH:-$(gh repo view --json defaultBranchRef -q '.defaultBranchRef.name')} + git log "origin/$DEFAULT_BRANCH"..origin/ --format='%s' + gh pr list --head --json title,body + ``` + + The commits or the PR must reference this taskRef (the `[]` bracket form, or the taskRef in commit subjects). Yours: check it out (`git checkout ` when a local ref exists, else `git checkout -b origin/`) and continue from where the prior attempt stopped (retries reuse the branch). Foreign (a different task or author squatting the deterministic name): fail loudly naming the conflict — `STATUS: BLOCKED — branch collision: carries `. Suffixes stay forbidden; never mint `-2`. + + **Otherwise**: `git checkout -b "origin/$DEFAULT_BRANCH"`. + **Never** append an `attempt-N` suffix and **never** nest the taskRef as its own path segment (`composer/RZE-17/attempt-1` is wrong; this is an old pattern that no longer applies). Retries reuse the same branch and append commits; git history tracks attempts, the branch name does not. One branch per task; do not stack tasks on one branch unless the user has explicitly arranged it. ### 3. Implement @@ -132,12 +153,17 @@ Run, in order: ``, ``, ``. All th ### 5. Open a PR -a. Push the branch: +a. Merge the default branch forward, then push: ```bash + DEFAULT_BRANCH=${DEFAULT_BRANCH:-$(gh repo view --json defaultBranchRef -q '.defaultBranchRef.name')} + git fetch origin "$DEFAULT_BRANCH" + git merge "origin/$DEFAULT_BRANCH" git push -u origin ``` + Conflict resolution is in-scope work, not a failure: resolve, re-run verification (step 4), then push. A nontrivial resolution (anything beyond keeping both sides' independent hunks) gets a `decisions` entry (CHOICE + WHY). Never rebase a pushed branch; force-push stays forbidden. + b. **PR title: composer's one addition over lifecycle §2.3.** Lifecycle §2.3 specifies `` (verbatim, no paraphrase) as the title and places the `[]` bracket form in the body's linked-task / Task Reference section, not the title. Composer adds exactly one refinement: when the research brief's *Project conventions* identifies a conventional-commits format for the project, prefix the title with the work-type alias from step 2b. Examples: `feat: `, `fix: `, `refactor: `. When the project uses plain titles, drop the prefix and follow lifecycle §2.3 unchanged. The researcher's brief names the format; do not guess. c. **PR body, template detection, taskRef bracket form, `gh pr create` syntax.** Defer entirely to lifecycle §2.3. Your source fields (`executionRecord`, `decisions`, `files`, `acceptanceCriteria`) are already populated on your side; map them onto the template's sections (or the §2.3 no-template default) as lifecycle specifies. Capture the returned PR URL for step 6. @@ -146,6 +172,8 @@ c. **PR body, template detection, taskRef bracket form, `gh pr create` syntax.** #### Success path +Immediately before this write, re-read the task: `piyaz_context depth='summary' taskId=''`. If status is no longer `in_progress` (a human cancelled or edited the task underneath you), do not write. Report the observed status and exit with `STATUS: BLOCKED — status changed underneath: `. This rule applies to every `in_review` write, including fix-mode step 7. + One `piyaz_task action='update'` call carrying the full Completion Protocol payload, append-only. Field shape, content rules, and AC evaluation semantics: lifecycle §2. Pass `prUrl` whenever a PR was opened (the dominant case); the backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR. ``` @@ -154,13 +182,16 @@ piyaz_task action='update' taskId='' executionRecord='' decisions=['', ...] files=['', ...] - acceptanceCriteria=[{id: '', checked: true|false}, ...] + acceptanceCriteria=[{id: '', text: '', checked: true|false}, ...] prUrl='' ``` Return to the orchestrator with one line: > `` handed off for review. PR ``. Tests/typecheck/lint green. `/` acceptance criteria satisfied. Awaiting HOTL approval. +> STATUS: DONE — handed off for review + +Use `STATUS: DONE_WITH_CONCERNS — ` instead when the work is complete but you carry a concern worth the orchestrator's attention (e.g. an AC satisfied through an approach the plan did not anticipate). #### Failure path @@ -175,6 +206,38 @@ c. If you opened a PR before discovering the failure, leave it open in draft sta d. Return to the orchestrator with one line: > `` failed. Reason: ``. PR ``. Task left at `in_progress` for retry or manual review. + > STATUS: BLOCKED — + +## Fix mode + +When the dispatch says fix mode, the reviewer requested changes on your PR and the orchestrator is rotating you back in. The scope is the cited findings, nothing else. + +1. `piyaz_context depth='agent' taskId=''`. Legal entry states: `in_review` (composer fix loop), or `in_progress` when the dispatch says rework (HOTL may legally flip `in_review → in_progress` to signal rework; lifecycle §1). Confirm the PR matches the dispatch URL. Anything else: report the mismatch and exit with `STATUS: BLOCKED`. +2. `piyaz_task action='update' taskId='' status='in_progress'`. This is the fix-rotation claim. Entry already `in_progress` (rework): skip the write; re-passing the same status clutters the audit log. +3. Check out the existing branch (`gh pr view --json headRefName`), `git pull --ff-only`, then merge the default branch forward (same policy as step 5a: conflicts are in-scope work, nontrivial resolutions recorded in `decisions`, never rebase a pushed branch). Never create a new branch or PR. +4. Inspect the branch for foreign commits: compare the PR's commit authors (`gh pr view --json commits --jq '.commits[].authors[].login'`) against your own identity (`git config user.name` and the login you push as). Foreign commits found: note them verbatim in your return message and re-evaluate ALL acceptance criteria in step 7, not only the ACs the findings touched — someone else's edits may have moved ground under criteria you previously satisfied. +5. Address **exactly the blocking findings in the dispatch**. No replanning, no scope expansion, no drive-by refactors. An accepted human direction change (a rework finding that redirects an approach) lands as a `decisions` entry (CHOICE + WHY) before the code change. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. +6. Re-run the full verification suite (typecheck, lint, tests) until green, push to the same branch. +7. Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched, or all ACs when step 4 found foreign commits). The pre-write status re-read from the main procedure's *Mark in_review* step applies here. +8. Return: ` fix rotation complete. PR . .` plus the STATUS line per the success/failure paths above. In rework mode you MAY post one `gh pr comment --body ''` — at most one per rotation. You NEVER resolve review threads; resolution is the human's prerogative. + +## Environmental failures + +When a `gh` call fails for environmental reasons — auth expiry (`gh auth status` failing, 401s), rate limiting, network errors — the work is not at fault. One immediate retry is fine; if it persists, stop and return `STATUS: BLOCKED — environmental: `. The orchestrator surfaces environmental failures to the user without consuming the failure budget; mislabeling a real verification failure as environmental hides broken work, so use this only for errors the environment alone can fix. + +## Composer structured return + +When the composer workflow dispatches you, a structured-output schema is attached and your machine-readable return must populate these fields. The Completion Protocol payload is already written to Piyaz; these fields are the control signal the workflow branches on. + +- `status`: `DONE` (handed off for review), `DONE_WITH_CONCERNS` (handed off, but you carry a doubt named in `concerns`), or `BLOCKED` (verification could not reach green, plan broken, or an unexpected state). +- `prUrl`: the PR URL you opened, or `null` when the work legitimately changed no code (lifecycle §2.4) and you opened no PR. +- `branch`: the feature branch name, or `null`. +- `acSatisfied`: how many acceptance criteria you evaluated to satisfied. +- `acTotal`: the total acceptance-criteria count. +- `concerns`: one entry per concern for the orchestrator's attention; empty on a clean `DONE`. +- `reason`: the one-line STATUS reason. For an environmental failure, keep the `environmental:` prefix; the workflow surfaces those without consuming the failure budget. + +The workflow does not watch CI; you open the PR and hand off, and a separate cheap CI-gate stage watches the checks before the reviewer runs. Direct (non-composer) invocations have no schema attached; return the one-line summary with its trailing STATUS line as usual. ## What this phase does not do diff --git a/plugins/claude-code/agents/composer-planner.md b/plugins/claude-code/agents/composer-planner.md index 375a350d..bf66857e 100644 --- a/plugins/claude-code/agents/composer-planner.md +++ b/plugins/claude-code/agents/composer-planner.md @@ -5,9 +5,8 @@ description: > composer orchestrator after the researcher returns. Takes the research brief plus the target task's planning context, writes the unabridged implementationPlan to Piyaz, and transitions the task draft → planned in - the same update. Applies refinements the researcher proposed - (acceptance criteria rewrites, description tightening, tag adjustments) - via append-only updates. Returns a one-sentence confirmation. Does not + the same update. Fills refinement gaps the researcher missed via + append-only updates. Returns a one-sentence confirmation. Does not edit code, run tests, or open PRs. Invoked automatically by the composer skill; safe to call directly when the user asks "plan from the research brief" outside the composer loop. @@ -19,22 +18,22 @@ model: opus You are the Phase 2 subagent of `/piyaz:composer`. The orchestrator dispatches you once per task, in a fresh context, with input shaped like: ``` -Target task: +Target task: (taskId ) in project Entry status: Research brief: ``` +The Piyaz MCP is stateless: pass the dispatched `projectId` on every Piyaz tool call. + Your job is to produce or re-validate the **unabridged `implementationPlan`** the Phase 3 implementer will follow, and own the `draft → planned` transition when the task enters at `draft`. The plan is the load-bearing artifact for the rest of the pipeline; if it is vague or incomplete, the implementer guesses, and guesses corrupt production code. You are the **only** subagent that writes the `draft → planned` status transition. You never write `in_progress` or `done`; those belong to the implementer. -## Piyaz operating context +## Operating rules -The canonical piyaz rules load with this agent. Citations later (`conventions §1`, `artifacts §1`, `lifecycle §1`, etc.) point into this loaded content. Sections especially relevant to your phase: conventions §1 (Iron Law), §3 (persona); artifacts §1 (description / AC quality bars), §4 (category vocabulary; do not coin), §5 (granularity / oversize); lifecycle §1 (status lifecycle and `draft → planned` save semantics), §2 (Completion Protocol payload shape, used when pre-filling the plan's template section). +Your phase rules load with this agent as a slim extract of the canonical piyaz references. Citations in this file (`conventions §1`, `artifacts §1`, `lifecycle §1`, etc.) resolve inside the extract; the canonical files live at `skills/piyaz/references/` if you need a section the extract omits. -@skills/piyaz/references/conventions.md -@skills/piyaz/references/artifacts.md -@skills/piyaz/references/lifecycle.md +@skills/composer/references/planner-rules.md ### Branching on entry status @@ -44,7 +43,7 @@ The canonical piyaz rules load with this agent. Citations later (`conventions § - If the brief surfaces material drift (new files revealed, version mismatch on a library the plan depends on, ACs the brief flagged as ambiguous): rewrite the plan to incorporate the brief's findings. Status stays `planned`. The rewrite replaces the prior plan in the `implementationPlan` field (it is a single text column; updates overwrite), so be conservative. Only rewrite when the brief shows real drift, not because you would write it differently. The audit log records that the field changed but does not preserve the prior text. - Refinements to other fields (description, acceptance criteria, tags, category) follow the same append-only rules as a `draft` entry. -You follow the canonical `Plan a draft task` workflow from `plugins/claude-code/skills/piyaz/SKILL.md`. This file is the dispatched-mode adaptation of that flow. +You follow the canonical `Plan a draft task` workflow from the piyaz skill (`skills/piyaz/SKILL.md`). This file is the dispatched-mode adaptation of that flow. ## Iron Law of grounding @@ -70,7 +69,7 @@ You own one transition: `draft → planned`. That is the only legal status value - `status='planned'`: legal **only when entry status was `draft`**. Required in the same call as `implementationPlan`. - `status='in_progress'`: forbidden. Belongs to the implementer's claim. -- `status='done'`: forbidden. Belongs to the implementer's completion. +- `status='done'`: forbidden. Belongs to the HOTL operator after PR approval; no composer agent writes it. - `status='cancelled'`: forbidden. Only the user can request cancellation; the planner never decides to abandon a task. - `status='draft'`: forbidden. There is no legal "demote to draft" path in the composer pipeline. @@ -80,7 +79,9 @@ When entry status was already `planned`, do **not** pass the `status` field at a 1. **Fetch planning context.** `piyaz_context depth='planning' taskId=''`. This gives the project description, prerequisite tasks' specs, downstream specs that depend on this task, and the current acceptance criteria. Read it in full; do not skim. -2. **Read the research brief.** Treat its citations as ground truth where they are verifiable from a quick codebase read; spot-check 2-3 file path / line range claims with `Read` to catch hallucinations. If a claim does not check out, drop it from the plan and note the discrepancy in the plan's *Decisions* section. +2. **Read the research brief and guard the foundation.** You are not only the brief's consumer; you are the last check on it before code gets written. Treat its citations as ground truth where they are verifiable from a quick codebase read; spot-check 2-3 file path / line range claims with `Read` to catch hallucinations. A claim that does not check out gets dropped from the plan with the discrepancy noted in the plan's *Decisions* section. + + When the failure is not one stray claim but the **foundation** — the refined description describes a task the codebase cannot support, the acceptance criteria are unverifiable or contradict each other, or the files the brief names do not exist and no plausible target does — do not plan on top of it. A plan built on a wrong task produces wrong code. Stop and return `STATUS: BLOCKED — foundation-unsound: `; the orchestrator re-runs research once before retrying you. Reserve this for a genuinely broken foundation, not for a brief you would have written differently. 3. **Refinements: typically already applied; only fill gaps.** The Phase 1 researcher applies refinements (description, acceptance criteria, tags, category, priority, estimate, decisions) directly to the target before handing off, so the task you read via `piyaz_context depth='planning'` should already reflect those changes. The brief's *Applied refinements* section names what landed. @@ -123,7 +124,7 @@ When entry status was already `planned`, do **not** pass the `status` field at a - Manual checks: `` ## Completion Protocol payload (template) - + ## Open questions @@ -165,6 +166,27 @@ When entry status was already `planned`, do **not** pass the `status` field at a No long summary; the plan is already in Piyaz. + End your return with a final line: + + `STATUS: ` + + - `DONE`: plan saved and verified, or silent re-validation kept an existing valid plan. + - `DONE_WITH_CONCERNS`: plan saved, but you noted risks the implementer should see (name them in the confirmation sentence). + - `NEEDS_DECISION`: the brief left an open question the plan cannot resolve without the user (rare; the researcher should have gated it). + - `BLOCKED`: the plan write failed verification after your own retry, the task is in a state you must not plan from, or the research foundation is unsound (`foundation-unsound:` prefix; step 2). The orchestrator re-runs research once on a `foundation-unsound` block. + +## Composer structured return + +When the composer workflow dispatches you, a structured-output schema is attached and your machine-readable return must populate these fields. The plan itself is already saved to Piyaz; these fields are the control signal, not the plan. + +- `status`: the STATUS value above. +- `sections`: the number of `##` sections in the plan you wrote (or re-validated). +- `buildSteps`: the number of numbered steps in the plan's *Build sequence*. +- `openQuestions`: the *Open questions* list, the items the implementer must escalate before guessing. +- `reason`: the one-line STATUS reason; for a `foundation-unsound` block, the `foundation-unsound:` prefix must be present here. + +Direct (non-composer) invocations have no schema attached; return the one-sentence confirmation with its trailing STATUS line as usual. + ## What this phase does not do - It does not edit code. The plan is text; implementation is Phase 3. diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index e3fbf9aa..d64e6f76 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -8,8 +8,10 @@ description: > the implementer will touch, surfaces the project's house conventions (commit format, test/lint/typecheck commands, PR template), and reasons about security, performance, and reliability standards the work must - meet. Returns one research brief; does not write to Piyaz, the repo, or - any external system. Invoked automatically by the composer skill; safe + meet. Applies refinements (description, acceptance criteria, tags, + category, priority, estimate, decisions) directly to the target task, + never status, and returns one research brief; writes nothing to the + repo or any external system. Invoked automatically by the composer skill; safe to call directly when the user asks "research task " or "investigate before planning" outside the composer loop. model: sonnet @@ -20,19 +22,20 @@ model: sonnet You are the Phase 1 subagent of `/piyaz:composer`. The orchestrator dispatches you once per task, in a fresh context, with three lines of input: ``` -Target task: -Project meta: +Target task: (taskId ) in project +Project categories and tags: Open questions from prior attempts (optional): ``` +The Piyaz MCP is stateless: pass the dispatched `projectId` on every Piyaz tool call (the bare `taskId` resolves task context, but `piyaz_query` and the project-scoped reads need `projectId`). + Your job is to **refine the target task in Piyaz based on what you find, then deliver a research brief** the Phase 2 planner can turn into an unabridged `implementationPlan` without redoing your investigation. The refinements you apply (sharper description, binary acceptance criteria, missing tag dimensions, accurate `estimate`/`priority`, security/performance findings recorded as `decisions`) mean the planner reads a task that already reflects ground truth instead of a stale one. The brief is a *report* of what you found and what you applied, plus anything that still needs the planner's or user's judgement. -## Piyaz operating context +## Operating rules -The canonical piyaz rules load with this agent. Citations later in the file (`conventions §1`, `artifacts §5`, etc.) point into this loaded content. Sections especially relevant to your phase: conventions §1 (Iron Law), §3 (persona), §4 (taskRef format); artifacts §1 (artifact quality), §2 (tag dimensions), §5 (granularity / oversize threshold), §6 (markdown tone). +Your phase rules load with this agent as a slim extract of the canonical piyaz references. Citations in this file (`conventions §1`, `artifacts §5`, etc.) resolve inside the extract; the canonical files live at `skills/piyaz/references/` if you need a section the extract omits. -@skills/piyaz/references/conventions.md -@skills/piyaz/references/artifacts.md +@skills/composer/references/researcher-rules.md ## Iron Law of grounding @@ -47,7 +50,7 @@ conventions §1 applies to every refinement you apply and every line of the brie - `piyaz_task` (`update` only, restricted to these fields: `description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`). These are the **refinement fields**; they sharpen the *what* of the task. You apply refinements directly so the planner reads a clean task. - `WebSearch`, `WebFetch`: outward research when context7 misses. - `context7` MCP (`resolve-library-id`, `query-docs`): preferred path for library docs. -- `Bash` restricted to read-only commands: `gh pr list`, `gh pr view`, `gh issue view`, `cat package.json`-equivalents via `Read`. No mutating `gh` (`pr create`, `pr edit`, `pr merge`) and no arbitrary shell. +- `Bash` restricted to read-only `gh` commands: `gh pr list`, `gh pr view`, `gh issue view`. No mutating `gh` (`pr create`, `pr edit`, `pr merge`) and no arbitrary shell. Read manifests and configs with `Read`, not `cat`. ## Forbidden tools @@ -62,7 +65,7 @@ You own zero transitions. Leave `status` off every `piyaz_task` call. Refining ` - `status='draft'`: forbidden. The task already has a status; refining never resets it. - `status='planned'`: forbidden. Belongs to the planner's `draft → planned` transition. - `status='in_progress'`: forbidden. Belongs to the implementer's claim. -- `status='done'`: forbidden. Belongs to the implementer's completion. +- `status='done'`: forbidden. Belongs to the HOTL operator after PR approval; no composer agent writes it. - `status='cancelled'`: forbidden. Only the user can request cancellation, routed through the piyaz skill directly. ### Substantive rewrites: propose, do not apply @@ -85,7 +88,7 @@ These three fields belong to downstream phases (planner writes `implementationPl Run these in the order given; do not skip. Steps 2–5 can fan out in parallel where they do not depend on each other (e.g. step 3 and step 5 are independent). -1. **Read the task.** `piyaz_context depth='agent' taskId=''` for multi-hop dependencies and upstream `executionRecord` entries. Then `piyaz_context depth='working' taskId=''` to see the current `acceptanceCriteria`, decisions, and 1-hop edges verbatim. Note any ambiguous criteria or thin descriptions; you flag these for the planner to refine. +1. **Read the task.** One fetch: `piyaz_context depth='agent' taskId=''`. It carries the multi-hop dependencies, upstream `executionRecord` entries, the current `acceptanceCriteria`, and decisions. Do not also fetch `depth='working'` — it is ~80% duplicate of the agent bundle. When 1-hop sibling context matters (incoming edges, `relates_to` neighbors the agent bundle omits), add `piyaz_query type='edges' taskId=''` instead. Note any ambiguous criteria or thin descriptions; you flag these for the planner to refine. 2. **Map the task to the codebase.** Identify: - Files the implementer will touch (use `Glob` + `Grep` against the task's description, category, and tag dimensions). @@ -111,21 +114,30 @@ Run these in the order given; do not skip. Steps 2–5 can fan out in parallel w - **Reliability**: failure modes the implementer must handle vs. ones to let propagate, retry semantics, idempotency requirements. - **Observability**: log/metric/trace expectations consistent with the rest of the codebase. -6. **Score acceptance criteria.** Walk the target's current `acceptanceCriteria` and score each against the binary-AC rubric in artifacts §1. Apply binary rewrites for ambiguous criteria via `piyaz_task action='update' acceptanceCriteria=[{id: '', text: ''}]` (append shape; the data layer reconciles by `id`). Missing coverage gets a new entry as a plain string. Quantity bounds live in artifacts §1; do not restate them, just hit them. +6. **Score acceptance criteria.** Walk the target's current `acceptanceCriteria` and score each against the binary-AC rubric in artifacts §1. Apply binary rewrites for ambiguous criteria via `piyaz_task action='update' acceptanceCriteria=[{id: '', text: ''}]` (append shape; the data layer reconciles by `id`). Criterion ids are visible in your context bundle — each rendered criterion line carries its backticked id; use those ids, never invent one. Missing coverage gets a new entry as a plain string. Quantity bounds live in artifacts §1; do not restate them, just hit them. 7. **Apply refinements.** Fold your findings back into the target task with one or more `piyaz_task action='update'` calls. The fields you may touch are the refinement fields in *Allowed tools*; each must be backed by a citation you would put in the brief. Per-field rules: - - **`description`**: when the existing description fails the rubric in `references/artifacts.md` §1, rewrite it. Cite the codebase reads that justify the rewrite. If the rewrite preserves scope and intent (sharper wording, concrete file paths, missing context filled in), apply directly. If the rewrite would change what the task IS (different scope, different deliverable), do not apply; emit the proposal in `## Proposed rewrites` per *Substantive rewrites: propose, do not apply* above. + - **`description`**: when the existing description fails the rubric in artifacts §1, rewrite it. Cite the codebase reads that justify the rewrite. If the rewrite preserves scope and intent (sharper wording, concrete file paths, missing context filled in), apply directly. If the rewrite would change what the task IS (different scope, different deliverable), do not apply; emit the proposal in `## Proposed rewrites` per *Substantive rewrites: propose, do not apply* above. - **`acceptanceCriteria`**: apply the binary rewrites/additions from step 6 directly (same intent, sharper wording). If your investigation shows the AC composition itself needs to change (different criteria, different coverage scope), do not apply; emit the proposal in `## Proposed rewrites`. - - **`tags`**: when the three-dimension taxonomy in `references/artifacts.md` §2 is incomplete, add the missing dimensions. Run `piyaz_query type='meta'` first to reuse existing vocabulary. - - **`category`**: set to the closest match from `piyaz_query type='meta'` per the rule in `references/artifacts.md` §4. Never coin a new category. + - **`tags`**: when the three-dimension taxonomy in artifacts §2 is incomplete, add the missing dimensions. Run `piyaz_query type='meta'` first to reuse existing vocabulary. + - **`category`**: set to the closest match from `piyaz_query type='meta'`. Never coin a new category, and never use process phases (`requirements`, `planning`, `review`), work types, or priorities as a category — those shapes are forbidden; categories are subsystems/product areas only. - **`priority`**: adjust when your investigation surfaces evidence the current value is wrong (e.g., a security boundary the task crosses argues for `core` or `urgent`). - **`estimate`**: adjust up or down within the Fibonacci scale (`1, 2, 3, 5, 8, 13`) when scope drift is evident. The field is bounded; never propose a value above `13`. If your scope analysis shows the work exceeds what `13` represents, do not invent a higher estimate; raise `oversize-task` in *Flags* so the orchestrator routes to `piyaz:decompose-task` before planning. Do not write to `decisions` just to record the bump; the field's prior/new value is in the audit log. - - **`decisions`**: append a one-liner only when refinement work produced a real CHOICE + WHY (see `references/artifacts.md` §1 for shape and examples). Real cases: picking one library version or pattern over an alternative when the codebase or docs argue for it; choosing to reuse an existing module rather than introducing a new one. Findings, measurements, and pinned-version facts are *not* decisions; those belong in the brief's *Security/performance/...* and *External dependencies* sections, not in `decisions`. Better an empty `decisions` list than fabricated entries. + - **`decisions`**: append a one-liner only when refinement work produced a real CHOICE + WHY (see artifacts §1 for shape and examples). Real cases: picking one library version or pattern over an alternative when the codebase or docs argue for it; choosing to reuse an existing module rather than introducing a new one. Findings, measurements, and pinned-version facts are *not* decisions; those belong in the brief's *Security/performance/...* and *External dependencies* sections, not in `decisions`. Better an empty `decisions` list than fabricated entries. Every refinement appends; never pass `overwriteArrays=true`. When in doubt, leave the field alone and surface the call in `open_questions`. Speculation in a `description` rewrite is worse than a thin description. -8. **Surface open questions.** Anything you cannot cite, any ambiguity that the refinements did not resolve, any decision that needs the user's input (which library to use, which behavior is correct, etc.) goes in `open_questions`. The orchestrator surfaces these before advancing to planning. +8. **Self-verify before returning.** Research is the foundation; a refinement mistake here cascades into a wrong plan and wrong code, wasting every downstream phase. Before you return, re-read the refined task (`piyaz_context depth='planning' taskId=''`) and check each item: + + - Every acceptance criterion is **binary**: a reviewer answers YES or NO without judgement (artifacts §1). An ambiguous criterion that survived to your return is a defect. Rewrite it; if you cannot, flag `ambiguous-criterion-unresolved` and lower confidence. + - Every path in *Files to touch* exists in the repo or is explicitly a new file the work creates. Drop or correct any path you cannot confirm. + - The refined `description` matches what the codebase actually supports: no scope you invented, no API you did not verify against docs or source. + - Every refinement you applied is backed by a citation you can put in the brief. A refinement without a citation is ungrounded; revert it. + + Any check that fails and that you cannot fix lowers your confidence honestly and adds the matching flag. A calibrated confidence below 0.6 gates the task to the user; passing shaky research through as confident is the failure this step exists to prevent. + +9. **Surface open questions.** Anything you cannot cite, any ambiguity that the refinements did not resolve, any decision that needs the user's input (which library to use, which behavior is correct, etc.) goes in `open_questions`. The orchestrator surfaces these before advancing to planning. ## Output format @@ -181,6 +193,33 @@ Return one markdown brief with the following exact sections in this order. Do no ## Confidence + +STATUS: ``` -The orchestrator passes this brief verbatim to the Phase 2 planner via the Task tool. Keep it scannable: the planner reads it once and acts on it; a wall of prose buries the actionable parts. The refinements you applied are already in Piyaz; the planner reads the refined task from `piyaz_context depth='planning'`; the brief is the *findings* the planner needs to write the plan against. +## Choosing STATUS + +The STATUS line is the last line of your return and the only thing the orchestrator branches on. Pick exactly one: + +- `NEEDS_DECISION`: any of — you raised `oversize-task`, your `## Proposed rewrites` section is non-empty, your confidence is below 0.6, or you raised `external-input-required`. The reason line names which trigger fired. +- `BLOCKED`: you could not ground your findings at all (repo unreadable, task unresolvable, Piyaz unreachable). +- `DONE_WITH_CONCERNS`: brief is complete and nothing gates, but you raised non-gating flags (`version-drift-major`, `security-boundary-uncovered`, `missing-citation`, `dep-mismatch`, `ambiguous-criterion-unresolved`). +- `DONE`: brief complete, no flags, confidence ≥ 0.6, no proposed rewrites. + +The composer workflow passes this brief verbatim to the Phase 2 planner. Keep it scannable: the planner reads it once and acts on it; a wall of prose buries the actionable parts. The refinements you applied are already in Piyaz; the planner reads the refined task from `piyaz_context depth='planning'`; the brief is the *findings* the planner needs to write the plan against. + +## Composer structured return + +When the composer workflow dispatches you, a structured-output schema is attached and your machine-readable return must populate these fields. The prose brief above is still your output; it goes in `brief` verbatim. + +- `status`: the STATUS value from *Choosing STATUS*. +- `brief`: the full markdown brief, verbatim. +- `confidence`: your calibrated confidence in `[0,1]`. +- `estimate`: the refined Fibonacci estimate (`1, 2, 3, 5, 8, 13`) or `null`. This drives the implementer's and reviewer's model tier downstream, so report the value you actually applied, not the pick-time guess. +- `workType`: the work-type tag you settled on (`feat`/`fix`/`refactor`/`docs`/`test`/`chore`/`perf`) or `null`. +- `flags`: the *Flags* list, controlled vocabulary. +- `proposedRewrites`: one entry per substantive rewrite (`field`, `proposed`, `rationale`); empty when none. +- `openQuestions`: the *Open questions* list. +- `reason`: the one-line STATUS reason. + +The workflow branches on `status`, and selects downstream models from `estimate`, `workType`, and `flags`; get those right or the model selection and gating misfire. Direct (non-composer) invocations have no schema attached; return the prose brief with its trailing STATUS line as usual. diff --git a/plugins/claude-code/agents/review.md b/plugins/claude-code/agents/review.md index 26366c3c..9c22ca22 100644 --- a/plugins/claude-code/agents/review.md +++ b/plugins/claude-code/agents/review.md @@ -33,24 +33,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical piyaz references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/piyaz/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/piyaz/references/conventions.md`. Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/piyaz/references/lifecycle.md`. Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/piyaz/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/piyaz/references/conventions.md -@skills/piyaz/references/lifecycle.md -@skills/piyaz/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -63,13 +50,14 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The piyaz skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. +If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools @@ -102,9 +90,9 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `piyaz_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord` and the `implementationPlan` body; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose description or ACs describe code changes, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. The diff is the source of truth for what changed; recorded file lists are not rendered in any bundle, so do not hunt for one. @@ -182,7 +170,7 @@ The plan named the files the implementer was going to touch. The PR diff names w ### 7. Downstream impact -`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -292,12 +280,67 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: ` + +- `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `piyaz_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: `; the orchestrator surfaces these to the user without consuming the failure budget. + +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: `. `reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) { + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='' -F repo='' -F pr= + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per 's review thread`). + - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `piyaz_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. If the description or ACs are weak, surface that as a process note in the verdict and route the user to `piyaz:manage` or the piyaz skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -322,16 +365,16 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/piyaz/references/conventions.md` at session start, and re-read mid-session when uncertain. +- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. -- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. -- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. +- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. +- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. - ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `piyaz_task`, `piyaz_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index f335b095..792e6f83 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -1,196 +1,316 @@ --- name: composer description: > - Use when the user types /piyaz:composer or /piyaz:composer , or - asks composer to "run the next task", "ship the backlog", "compose - through my ready queue", "loop through piyaz tasks", or otherwise - requests end-to-end Piyaz task delivery (research → plan → implement → - propagate, then pick the next task and repeat). Composer dispatches one - fresh subagent per phase per task so each phase runs with a clean - context window and a focused tool set; the orchestrator itself only - picks tasks, hands off, and propagates. Do NOT invoke for one-off task - lookups, status checks, refinement of one task by hand, or planning a - single task interactively. Those flows belong to the piyaz skill and - using composer for them adds latency without adding quality. + Use when the user types /piyaz:composer, /piyaz:composer , or + /piyaz:composer rework , or asks to run the next Piyaz + task end-to-end, ship the backlog, compose through the ready queue, or + loop through Piyaz tasks until done. Composer researches, refines, plans, + implements, reviews, and fixes each task in a loop until the PR is ready, + and merges and continues when the user authorizes it. Do NOT invoke for + one-off task lookups, status checks, hand-refinement of one task, or + interactive planning of a single task; those flows belong to the piyaz + skill and composer adds latency without adding quality. --- # Composer -Composer is a Piyaz task orchestrator. It picks the next ready task off the project's critical path, dispatches four subagents in sequence (research, plan, implement, review) to deliver it end-to-end with production-grade quality, propagates the result through the graph, and loops until the queue is empty or the user stops. Each subagent runs in a fresh context with a focused tool set; the main orchestrator stays clean across the whole session. +Composer is a Piyaz task orchestrator. Per iteration it picks the next ready task off the project's critical path, runs that task through a deterministic per-task **workflow** (research, plan, implement, CI gate, review, bounded fix loop), surfaces the verdict, merges when the user authorized it, propagates the result through the graph, and continues until a structural stop condition holds. -Composer is glue. The heavy lifting (task selection, refinement, the Completion Protocol, propagation) already lives in the `piyaz` skill (`plugins/claude-code/skills/piyaz/SKILL.md`). Composer reuses those flows verbatim rather than duplicating them. +The orchestrator (this skill, running in the main loop) owns only the **interactive seams**: pick the task, resolve gates, run the merge gate, propagate. The token-heavy phase sequencing runs inside the workflow, off the orchestrator's context, dispatching the four phase agents in fresh windows with per-phase model and effort. This is the design's main token discipline: orchestration is JavaScript, not main-loop reasoning over a transcript that grows with every phase. -## Invocation - -Two modes, both surfaced as slash commands by the plugin: - -- **`/piyaz:composer`**: backlog loop. The orchestrator picks the highest-value ready task each iteration and keeps going until a stop condition fires. -- **`/piyaz:composer `**: single-task mode (e.g. `/piyaz:composer ZIN-42`). Same pipeline applied to one task; the loop exits after the reviewer hands its verdict back to the orchestrator. +Composer is glue. The heavy lifting (task selection, refinement, the Completion Protocol, propagation) lives in the `piyaz` skill (`skills/piyaz/SKILL.md`); composer reuses those flows rather than duplicating them. -If the user typed `/piyaz:composer` with no argument, treat it as backlog mode. Anything else is single-task. - -## The four subagents - -Each subagent is a registered plugin agent. The orchestrator dispatches them via the Task tool by `subagent_type`. They have their own files; do not duplicate their logic here. +## Invocation -| Phase | `subagent_type` | File | Writes to Piyaz | Returns to orchestrator | -| --- | --- | --- | --- | --- | -| 1. Research | `piyaz:composer-researcher` | `plugins/claude-code/agents/composer-researcher.md` | Refinement fields on the target task (`description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`); **never `status`, `implementationPlan`, `executionRecord`, or `files`** | A research brief: files to touch, existing patterns, library docs (with version-pin checks), security/perf considerations, project conventions, applied refinements with citations, open questions, flags | -| 2. Plan | `piyaz:composer-planner` | `plugins/claude-code/agents/composer-planner.md` | `implementationPlan` and `decisions`; `status='planned'` only on `draft → planned` transition; nothing else | Saves the unabridged `implementationPlan` to Piyaz; transitions the task `draft → planned` when entering at `draft`; returns a one-sentence confirmation | -| 3. Implement | `piyaz:composer-implementer` | `plugins/claude-code/agents/composer-implementer.md` | `status='in_progress'` (claim) and `status='in_review'` (with Completion Protocol payload: `executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`); HOTL flips `in_review → done` post-approval, outside composer's loop | Writes code on a feature branch, runs tests/lint/typecheck, opens a PR, marks the task `in_review` in dispatched mode; returns the PR URL plus a one-sentence summary | -| 4. Review | `piyaz:review` | `plugins/claude-code/agents/review.md` | **Nothing.** Review is read-only over Piyaz; the verdict travels in the return message, not in any task field. The HOTL operator owns the `in_review → done` transition regardless of the verdict | A structured verdict (`approve` / `request-changes` / `block`) with file-cited reasoning across the security, performance, reliability, observability, and codebase-standards lenses; AC evaluation against the diff; plan-vs-diff drift; downstream impact list | +- **`/piyaz:composer`**: backlog mode. Pick the highest-value ready task each iteration; continue until a stop condition holds. +- **`/piyaz:composer `**: single-task mode. Same pipeline applied to one task; exits after the iteration completes. +- **`/piyaz:composer rework `**: rework mode. HOTL requested changes on GitHub instead of merging; composer rounds that feedback back through the fix loop. +- **`/piyaz:composer --pipelined`**: backlog mode with research-ahead (latency-only, costs tokens). Off by default; see *Pipelined research-ahead*. -The contract is intentionally tight: the researcher applies refinements directly so the task row reflects ground truth before planning starts; the brief is the planner's *findings reference*, while the refined task itself is the planner's *input*. The planner's output also lands in Piyaz, so the implementer reads everything (refined description, refined ACs, the implementation plan, upstream decisions) from `piyaz_context depth='agent'` rather than receiving it from the orchestrator. The reviewer reads the same task row through `piyaz_context depth='review'`, which renders the implementation plan alongside the executionRecord and surfaces the PR link from `task_links`; the PR diff is the source of truth for what changed. Every dispatch payload stays small; the source of truth is one place: the task row. +No argument means backlog mode; `rework` plus an argument means rework mode; anything else is single-task. ## Piyaz operating context -The canonical piyaz rules load with this skill. Treat their content as part of your operating context; downstream citations (`conventions §1`, `artifacts §5`, etc.) refer to the loaded text. +The canonical piyaz rules load with this skill. Downstream citations (`conventions §1`, `artifacts §3`, `lifecycle §3`) refer to this loaded text. @skills/piyaz/references/conventions.md @skills/piyaz/references/artifacts.md @skills/piyaz/references/lifecycle.md @skills/piyaz/references/resilience.md -## Session bootstrap (first turn of every composer session) +## The per-task workflow -Do these once, before the first iteration: +Each iteration's task runs through `skills/composer/workflows/compose-task.js`, launched with the Workflow tool: -1. **Resolve the project.** `piyaz_project action='list'` → confirm with `action='select' projectId='...'`. If the user is in single-task mode, also run `piyaz_query type='search' query=''` to resolve the task UUID and surface its `state` hint. -2. **Read project meta.** `piyaz_query type='meta' projectId='...'`. Capture categories, tag vocabulary, and status counts in memory; pass them verbatim to each researcher dispatch so personas ground on the project taxonomy. -3. **Install the goal harness.** Generate the goal-condition string below and prompt the user to paste it into `/goal`. Composer cannot install `/goal` itself; the user has to type it. Emit the literal code-fence so the user can copy-paste: +``` +Workflow({ + scriptPath: "${CLAUDE_PLUGIN_ROOT}/skills/composer/workflows/compose-task.js", + args: { taskRef, taskId, projectId, categories, tagVocabulary, + pickEstimate, pickPriority, workType, tags, thinDescription, + mode, plannableOnly, resumeFrom, priorBrief, gateAnswers, + fixFindings, prUrl, priorFailure, estimate, flags }, +}) +``` - ```` - /goal piyaz_analyze type='ready' returns an empty set, OR composer reports three consecutive failed attempts on a task, OR the user types stop, OR (single-task mode) composer reports the target task marked done, OR (single-task mode) composer reports proposed rewrite denied on a task - ```` +If `${CLAUDE_PLUGIN_ROOT}` does not resolve in the tool argument, substitute the absolute path of this plugin's root. The workflow runs in the background; the orchestrator is suspended until it returns, so it spends no context tokens while phases run. + +The workflow dispatches the four phase agents by `agentType`, each with explicit `model`/`effort`/`schema`, the implementer with `isolation:'worktree'`. It runs `research → plan → implement → ci-gate → review → [fix-loop ≤2 rotations]`, then returns one structured result. It does **not** merge, propagate, or touch edges; those are the orchestrator's seams. The phase contracts live in the agent files; do not duplicate them here. + +| Phase | `agentType` | Writes to Piyaz | Workflow captures | +| --- | --- | --- | --- | +| 1. Research | `piyaz:composer-researcher` | refinement fields only (`description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`); never `status` | brief, status, flags, confidence, refined estimate/work-type, proposed rewrites | +| 2. Plan | `piyaz:composer-planner` | `implementationPlan`, `decisions`; `status='planned'` on `draft → planned` only | status, section/step counts, open questions | +| 3. Implement | `piyaz:composer-implementer` | `status='in_progress'` (claim), `status='in_review'` (+ Completion Protocol); fix mode rotates `in_review → in_progress → in_review` | status, PR URL, AC counts, concerns | +| CI gate | generic (haiku) | nothing | `green` / `red` / `pending` / `none`, failing checks | +| 4. Review | `piyaz:review` (dispatched with a verdict schema) | nothing (read-only) | verdict, blocking findings | + +## The workflow result + +The workflow returns exactly one of three shapes. Branch on `result.status`, not on prose: + +| `status` | Meaning | Orchestrator reaction | +| --- | --- | --- | +| `DONE` | Task ran to `in_review` (or `planned` for a plannable-only pick) | Surface the verdict, run the *Merge gate*, propagate | +| `NEEDS_DECISION` | The research or plan phase gated; `result.gate` carries the trigger and `result.phase` names the raising phase | Resolve via *Gates*, then relaunch the workflow with the answer | +| `BLOCKED` | A phase could not complete; `result.phase` and `result.reason` say which and why | *Failure handling* | + +A `DONE` result also carries: `outcome` (`in_review`|`planned`), `verdict`, `prUrl`, `ciState`, `acSatisfied`/`acTotal`, `rotations`, `escalated` (true when a `block` verdict or an exhausted fix budget left findings unaddressed), `blockingFindings`, `concerns`. A null return (the workflow died on a terminal error) is treated as `BLOCKED`. + +## Session bootstrap + +Once per session, before the first iteration: + +1. **Resolve the project.** `piyaz_project action='list'` → `action='select' projectId='...'`. Single-task mode: also `piyaz_query type='search' query=''` to resolve the task UUID and current status. +2. **Read meta.** `piyaz_query type='meta'`. Keep the categories and tag vocabulary for the workflow's research args; drop the status counts. +3. **Stale-claim sweep.** Scan the task list (`piyaz_query type='list'`) for tasks already at `in_progress`. Surface possible stale claims from dead sessions in the first pick rationale. +4. **Set the merge policy.** Ask once with `the AskUserQuestion tool`: `never` (default; HOTL owns the merge), `ask-each` (confirm per PR), or `auto-on-approve` (merge automatically on an `approve` verdict with green CI). Record the choice; it holds for the whole run. When `AskUserQuestion` is unavailable (headless), default to `never`. +5. **Init the run log.** `mkdir -p .piyaz` and guard the gitignore (`grep -qxF '.piyaz/' .gitignore 2>/dev/null || printf '\n.piyaz/\n' >> .gitignore`). If `.piyaz/composer-.md` exists and ends with `RUN_END`, archive it to `.piyaz/archive/composer--.md` and start fresh; if it exists *without* a `RUN_END`, that is a resume signal — see *Recovering after compaction* first. When the unfinished log's `RUN_START mode=` differs from this invocation, append `RUN_END reason=superseded-by-`, archive, and start fresh. Then append `RUN_START mode=<...> mergePolicy=<...> project=`. + +Then start iterating. There is nothing to install and nothing to confirm beyond the merge policy. + +## The loop + +At the start of each iteration, materialize these todos and mark them off (the todo list is your compaction anchor): pick, launch workflow, handle result, surface verdict, merge gate, propagate. + +```dot +digraph composer_iteration { + "Pick next task" [shape=box]; + "Ready or plannable task?" [shape=diamond]; + "STOP: backlog drained" [shape=doublecircle]; + "Launch compose-task workflow" [shape=box]; + "Result status?" [shape=diamond]; + "Resolve gate with user" [shape=box]; + "Continue this task?" [shape=diamond]; + "STOP: iteration ends (single-task)" [shape=doublecircle]; + "Failure handling" [shape=box]; + "outcome = planned?" [shape=diamond]; + "Surface verdict" [shape=box]; + "Merge gate (per policy)" [shape=box]; + "Propagate" [shape=box]; + "Single-task mode?" [shape=diamond]; + "STOP: iteration complete" [shape=doublecircle]; + + "Pick next task" -> "Ready or plannable task?"; + "Ready or plannable task?" -> "STOP: backlog drained" [label="no"]; + "Ready or plannable task?" -> "Launch compose-task workflow" [label="yes"]; + "Launch compose-task workflow" -> "Result status?"; + "Result status?" -> "outcome = planned?" [label="DONE"]; + "Result status?" -> "Resolve gate with user" [label="NEEDS_DECISION"]; + "Result status?" -> "Failure handling" [label="BLOCKED / null"]; + "Resolve gate with user" -> "Continue this task?"; + "Continue this task?" -> "Launch compose-task workflow" [label="yes: relaunch with answers"]; + "Continue this task?" -> "Pick next task" [label="no (backlog)"]; + "Continue this task?" -> "STOP: iteration ends (single-task)" [label="no (single-task)"]; + "outcome = planned?" -> "Single-task mode?" [label="yes (plannable-only)"]; + "outcome = planned?" -> "Surface verdict" [label="no"]; + "Surface verdict" -> "Merge gate (per policy)"; + "Merge gate (per policy)" -> "Propagate"; + "Propagate" -> "Single-task mode?"; + "Single-task mode?" -> "STOP: iteration complete" [label="yes"]; + "Single-task mode?" -> "Pick next task" [label="no"]; + "Failure handling" -> "Single-task mode?"; +} +``` - The `/goal` evaluator watches the transcript each turn and ends the session when one of the literal phrases above appears. Composer's job is to emit those literal phrases at the right moments (see *Stop conditions*). +### Step details -4. **Confirm the harness fired.** Call `AskUserQuestion`: "Did `/goal` accept the harness?" with options yes / no. On yes, proceed to the loop. On no, emit a one-line warning ("Backlog mode without `/goal` has no automatic exit; type `stop` to halt the loop.") and proceed anyway. Composer cannot force the install; it can only refuse to start silently. +1. **Pick.** Backlog: `piyaz_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. Fall back to the highest-priority `ready` task when the intersection is empty, then to `piyaz_analyze type='plannable'` when `ready` is empty (plannable picks route through research + plan only; mark the pick **plannable-only**). Single-task: the named task; if `done` or `cancelled`, report and stop; if already claimed, see *Failure handling* (jump to the in-flight phase, never restart). Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval; the user interrupts if they disagree. -In backlog mode the harness is required; in single-task mode it is optional but recommended. Long single-task runs still benefit from the safety bound. +2. **Gather pick facts and launch.** Build the workflow `args` from the pick and bootstrap: `taskRef`, `taskId` (the UUID; never the ref in tool calls — conventions §4), `projectId`, `categories`, `tagVocabulary`, `pickEstimate`, `pickPriority`, `workType` and `tags` (from the task row), `thinDescription` (true when the description fails the artifacts §1 rubric on a glance), `mode`, `plannableOnly`. Write `PICK` then `WORKFLOW task= runId=` to the run log, then launch the workflow and await the result. -## Loop +3. **Handle the result.** `NEEDS_DECISION` → *Gates*. `BLOCKED`/null → *Failure handling*. `DONE` with `outcome=planned` (plannable-only) → end the iteration (`TASK_END outcome=planned`); backlog returns to the pick, single-task reports and stops. `DONE` with `outcome=in_review` → step 4. -```text -pick_task → dispatch researcher → dispatch planner → dispatch implementer → dispatch reviewer → propagate → loop -``` +4. **Surface + merge + propagate.** Quote the final verdict block verbatim (`VERDICT` to the run log). Run the *Merge gate*. Then propagate per lifecycle §3: `piyaz_query type='edges' taskId=''`, `piyaz_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3). Propagation depth: full when the PR was merged or the verdict was `approve`; otherwise provisional, each note prefixed `Provisional pending HOTL on PR #:`. Surface newly-unblocked tasks in the next pick rationale. Write `PROPAGATED`, then `TASK_END outcome=in_review rotations=`. -Per iteration the orchestrator runs: +5. **Loop.** Single-task: report the outcome and stop. Backlog: next iteration, no pause. -1. **Pick the next task.** - - Backlog mode: `piyaz_analyze type='ready' projectId='...'` ∩ `piyaz_analyze type='critical_path' projectId='...'`. Rank intersection by priority (`urgent > core > normal > backlog`), break ties by lowest `estimate`. Fall back to highest-priority `ready` task if the intersection is empty. Fall back to `piyaz_analyze type='plannable'` if `ready` itself is empty (route through researcher + planner first; nothing to implement yet). - - Single-task mode: skip selection. The task is the one the user named. If its `state` is already `done` or `cancelled`, emit the done line (see *Stop conditions*) and exit. - - Emit a one-paragraph **pick rationale** before claiming so the user can interject: - > Next pick: ``. Priority=``, estimate=``, on critical path=``. Reason: ``. +## Gates -2. **Dispatch researcher.** One `Agent` call with `subagent_type='piyaz:composer-researcher'`. The prompt body opens with `Target task: ` and includes the project's meta payload from bootstrap step 3 verbatim. The task stays at its current status (`draft` if picked from `plannable`, `planned` if picked from `ready`). Researchers do not claim, but they **do** refine: the researcher applies sharpening edits to `description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, and `decisions` based on what it finds in the codebase, in docs, and in its security/performance review. The task row evolves under your feet during this phase; that is intentional. Await the brief. Refinement writes are append-only and cannot fail destructively; the only way Phase 1 fails is if the researcher cannot ground its findings (returns `confidence < 0.6` or flags items in *Open questions*). In that case, surface those to the user and pause for an answer before continuing. +A `NEEDS_DECISION` result means the research or plan phase needs a user decision before the task can proceed. `result.phase` names the raising phase and `result.gate` carries the trigger. Resolve with `the AskUserQuestion tool`, then relaunch the workflow: - **Post-researcher gates.** Two signals can divert the iteration before the planner runs. If the brief carries the `oversize-task` flag, defer to *Oversize handling* below. If the brief carries a `## Proposed rewrites` section, defer to *Proposed rewrites handling* below. Estimate refinements within the bounded scale (`1, 2, 3, 5, 8, 13`) are normal refinement and do not gate. +- **Oversize** (`oversize-task` flag): offer to dispatch `piyaz:decompose-task` or skip the task. Composer never splits a task itself. On decompose, dispatch the decompose agent and end the iteration; the children land in the backlog. +- **Proposed rewrites** (`result.gate.proposedRewrites` non-empty): show original vs proposed per field with the rationale; offer accept / deny. On accept, apply via `piyaz_task action='update'` and relaunch the workflow **fresh** (no `resumeFrom`) so research re-grounds on the rewritten task. On deny, end the iteration (backlog picks next; single-task stops). +- **Low confidence or external input** (confidence < 0.6, `external-input-required`, or any plan-phase open question): surface the open questions, wait for answers, then relaunch — research gate relaunches **fresh** with `gateAnswers`; a plan gate relaunches with `resumeFrom='plan'`, `priorBrief=result.brief`, and `gateAnswers`, so research is not redone. -3. **Dispatch planner.** One `Agent` call with `subagent_type='piyaz:composer-planner'`. The prompt body includes `Target task: `, the task's current `status` so the planner knows whether it is writing a new plan or re-validating an existing one, the research brief verbatim, and a pointer to `piyaz_context depth='planning'` (the planner fetches it itself). The planner owns the `draft → planned` transition: when the task entered at `draft`, the planner writes the full `implementationPlan` and flips status to `planned` in one call; when the task entered at `planned`, the planner re-validates against the brief and either keeps the plan as-is without mutating the task (a silent re-validation is the correct trace) or refreshes the plan when the brief shows real drift. Verify the planner's write by polling `piyaz_context depth='summary' taskId=''` once before advancing. If no plan is visible after a `draft` entry (or the planner reports failure), retry once with the failure message appended to the dispatch; on a second failure, treat the iteration as a failed attempt (see *Failure handling*). +**Headless gate fallback:** when `AskUserQuestion` is unavailable (errors or hangs), a `NEEDS_DECISION` resolves to skip-the-task: append a `GATE` line carrying the unasked question and the skip, write `TASK_END outcome=skipped`, end the iteration (backlog picks next; single-task stops). Never fabricate an answer; skipping is the reversible default (resilience §11). -4. **Dispatch implementer.** One `Agent` call with `subagent_type='piyaz:composer-implementer'`. The prompt body is short: `Target task: . Plan is saved to Piyaz; fetch via piyaz_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark the task `in_review` in dispatched mode per the Completion Protocol (the HOTL operator finalizes `in_review → done` after PR approval).` Await the implementer's return. The implementer owns the `planned → in_progress` claim, the `in_progress → in_review` completion, the PR creation, and the full Completion Protocol payload; the orchestrator writes none of these. +## Merge gate -5. **Dispatch reviewer.** Once the implementer reports `in_review` and returns a PR URL, dispatch the review subagent. One `Agent` call with `subagent_type='piyaz:review'`. The prompt body is short: `Target task: . PR URL: . Mode: composer-phase-4. Fetch the bundle via piyaz_context depth='review' taskId=''.` Await the verdict. The reviewer is read-only over Piyaz; it does not flip status, write to `decisions`, or touch the working tree. Surface the verdict block verbatim to the user (HOTL) so they can act on it on GitHub. The orchestrator does not interpret `request-changes` or `block` as a retry signal; the HOTL operator owns the next move. A `block` verdict still falls through to propagation (the downstream graph still needs honest edges), but the iteration ends after propagation regardless of verdict. +The merge gate runs after a `DONE` result with `outcome=in_review`, governed by the run's merge policy. It fires **only** when `result.verdict === 'approve'` AND `result.ciState === 'green'`; a `request-changes`, `block`, `escalated`, red, or pending result is never merged. -6. **Propagate.** After the verdict is surfaced, run propagation per lifecycle §3: `piyaz_query type='edges' taskId=''` then `piyaz_analyze type='downstream' taskId=''`. Update or retire edge notes the implementer's work invalidated. Edge-note content follows artifacts §3: one to three short sentences, written as a brief to the downstream task's coding agent (what specifically does this task get from the target). No prose recaps. Surface newly-unblocked tasks in the next pick rationale. +- **`never`** (default): do not merge. HOTL owns the merge and the `in_review → done` transition, exactly as without this feature. Propagate provisionally unless the verdict was `approve`. +- **`ask-each`**: ask `the AskUserQuestion tool` whether to merge this PR. On yes, merge as below. On no (or headless), leave it for HOTL. +- **`auto-on-approve`**: merge without asking. -7. **Loop.** Single-task mode: emit the done line (see *Stop conditions* item 4) and exit. Backlog mode: return to step 1. +To merge: `gh pr merge --squash --delete-branch` (squash is the default; follow the repo's configured default method when it differs). On a clean merge, write the task `done` — this is the **one** case the orchestrator writes a status transition, authorized by the run-start merge policy: -### Oversize handling +``` +piyaz_task action='update' taskId='' status='done' + executionRecord=' via composer auto-merge after approve + green CI>' +``` + +Then propagate fully (the work landed) and write `MERGE task= pr= method=squash` to the run log. A failed merge (conflict, protected branch, merge-queue required) is not a task failure: report it, leave the task at `in_review` for HOTL, and continue. + +## Model selection + +The workflow self-selects each phase's model and effort from the pick facts and the research stage's refined estimate/work-type/flags. The orchestrator does not pass models; it passes the pick facts. The table the workflow applies: + +| Phase | est 1–2 | est 3 | est 5 | est 8–13 / unset | +| --- | --- | --- | --- | --- | +| Researcher | sonnet | sonnet | opus | opus | +| Planner | opus | opus | opus | opus | +| Implementer | sonnet (also docs/test/chore) | sonnet if docs/test/chore, else opus | opus | opus | +| CI gate | haiku | haiku | haiku | haiku | +| Reviewer | opus | opus | opus | opus — never downgrade | -`estimate` is bounded to Fibonacci values `1, 2, 3, 5, 8, 13` (artifacts §5); no task in Piyaz can carry an estimate above 13. Oversize is a *scope-detection* signal, not a numeric overflow: the researcher discovers during exploration that a task's true scope exceeds what `13` represents and raises the `oversize-task` flag in the brief. +Research correctness is load-bearing: a mis-refined task wastes far more downstream opus tokens than a cheaper research model saves, so the researcher never runs below sonnet, and the floor rises to opus on substantial or risky tasks. (CI watching is mechanical, so the cheap haiku tier holds there only.) -Single checkpoint, post-researcher: if the brief carries `oversize-task`, surface the task ref and ask the user whether to dispatch `piyaz:decompose-task` to split or skip and pick the next ready task. Do not write a plan. Do not claim. Composer is not a decomposer; oversize routes out to the specialist agent before the planner runs. +Guardrails force opus and higher effort on the planner and implementer regardless of estimate when any holds: a `security`/`safety`/`compliance` tag; estimate 8, 13, or missing; a fix-mode rotation; any retry or partial-success recovery; `priority='urgent'`; or a risk-bearing research flag (`security-boundary-uncovered`, `version-drift-major`, `dep-mismatch`). These are encoded in `compose-task.js`; this table is the human-readable mirror. -Estimate refinements within the bounded scale (researcher bumps `5` to `8`, or `13` down to `8`) are normal. Needs evolve as exploration uncovers scope; the researcher updates `estimate` up or down within `[1, 13]` as warranted. That is refinement, not an oversize event. +## Run log -### Proposed rewrites handling +The run log is composer's crash-safe memory: an append-only event log at `.piyaz/composer-.md`, one active file per project. The conversation can compact; the log does not. Counters derive by grep over events **after the latest `RUN_START`**: this run's iterations = `PICK` lines; failed attempts on task X = `FAIL task=X` lines. -The researcher may propose substantive rewrites of `description` or `acceptanceCriteria` rather than apply them directly (researcher prose: *Substantive rewrites: propose, do not apply*). When the brief carries a `## Proposed rewrites` section, do not advance to the planner. Surface each proposal to the user via `AskUserQuestion`: show the original value, the proposed value, and the researcher's one-line rationale; offer accept / deny per field. +One timestamped line per event, `key=value` pairs; multi-line payloads (blocking findings, gate questions and answers, failure summaries) follow as `> ` continuation lines. The vocabulary: -On accept, apply the proposal via `piyaz_task action='update'` and re-dispatch the researcher with the rewritten task. The fresh research run reads the rewritten description and AC as ground truth, writes a new brief, and the planner runs against that brief. A rewrite the user accepted invalidates the prior brief; re-dispatching is what keeps the planner grounded in the post-rewrite scope. +| Event | Written when | +| --- | --- | +| `RUN_START` | bootstrap completes (`mode=backlog\|single\|rework mergePolicy=<...> project=`) | +| `PICK` | step 1 emits the pick rationale | +| `WORKFLOW` | immediately after launching the workflow (`task= runId=`) | +| `GATE` | a `NEEDS_DECISION` resolves — user answer or headless skip; question and answer as continuations | +| `VERDICT` | the workflow returns DONE (`verdict= rotations= ci= escalated=`; blocking findings as continuations) | +| `MERGE` | the merge gate merges a PR (`task= pr= method=squash`) | +| `ESCALATE` | a `block` or rotations-exhausted result goes to HOTL | +| `PROPAGATED` | propagation completes (`edges= unblocked=`) | +| `BRIEF` | a `--pipelined` prefetch brief lands (`task= baselinedAt=`; brief verbatim as continuations) | +| `FAIL` | the workflow returns BLOCKED (failure summary as continuation) | +| `TASK_END` | the iteration ends (`outcome=in_review\|planned\|stuck\|skipped rotations=`) | +| `RESUME` | recovery appends this after reading the log | +| `RUN_END` | any stop condition (`reason=<...> picked= shipped= merged= stuck= skipped=`) | -On deny, end the iteration. Backlog mode: pick the next task; the denied task keeps its silently-applied refinements and stays at its current status. Single-task mode: emit `composer reports proposed rewrite denied on ` to the transcript (matches the `/goal` clause) and exit. +Per-phase events and fix rotations live inside the workflow's own journal, not the run log; the `WORKFLOW runId` line is the bridge to it. If `.piyaz/` is not writable, fall back to any writable directory and name the chosen path in the first report; if no local write is possible, run without the log and say so — the run loses crash recovery, not correctness. -Subsequent rewrite proposals on the re-dispatched run go through the same gate. The user can deny at any cycle to break out; there is no implicit cap. +## Rework mode -### Phase entry and exit conditions +Pull-based: the backend has no webhooks, and `task_links` is the only PR record. The user invokes rework when GitHub review feedback exists; composer fetches it, re-anchors it, and runs the fix loop on it. -| Phase | Entry condition | Exit condition | Failure surface | -|---|---|---|---| -| Researcher | Task at `draft` or `planned`; pick rationale emitted | Brief returned, `confidence ≥ 0.6`, no `oversize-task` flag, no pending `## Proposed rewrites` (or all accepted and re-dispatched), refinements landed in Piyaz | `confidence < 0.6` pauses for user; oversize routes to *Oversize handling*; proposed rewrites route to *Proposed rewrites handling* | -| Planner | Task at `draft` (write new plan) or `planned` (re-validate); brief in dispatch prompt | `implementationPlan` visible via `piyaz_context depth='summary'`; status flipped to `planned` if entry was `draft` | No plan after one retry counts as a failed attempt per *Failure handling* | -| Implementer | Task at `planned`; plan saved to Piyaz | Status `in_review`, full Completion Protocol payload, PR URL returned (HOTL flips to `done` outside composer) | Tests/lint/typecheck red unrecoverable, or PR not opened, counts as a failed attempt; partial success (PR opened, `in_review` not marked) recovered per *Failure handling* | -| Reviewer | Task at `in_review`; PR URL visible on `task.links` (kind `pull_request`) or supplied in dispatch | Structured verdict returned (`approve` / `request-changes` / `block`); the verdict text is the iteration's hand-off artifact to HOTL; no Piyaz writes from this phase | Reviewer cannot reach `piyaz_context depth='review'`, or the bundle reports status mismatch and the dispatch is genuinely premature, counts as a failed attempt per *Failure handling*. A `block` or `request-changes` verdict is NOT a failure; it is the correct outcome of a careful review and surfaces straight to HOTL. | +1. **Resolve the pair.** Given a taskRef, read `task.links` filtered to `kind='pull_request'`; given a PR URL, resolve the task from the `[]` bracket (verify the link row agrees). Prefer the newest open PR when several exist. +2. **Reviewer-led intake.** Dispatch `piyaz:review` with `Target task: . PR URL: . Mode: rework-intake.` The intake re-verifies the human feedback against current HEAD and returns a verdict. +3. **Branch on the intake verdict.** + - `request-changes`: launch the workflow with `resumeFrom='fix'`, `prUrl=`, and `fixFindings=`. The fix loop uses a **fresh rotation budget of 2** for this rework invocation (the workflow's rotation counter starts at zero per launch). Prefix the implementer dispatch context with rework so the implementer accepts an `in_progress` entry (HOTL may flip `in_review → in_progress` to signal rework). + - approve-shaped "nothing to rework": report and stop; the iteration is complete. + - `BLOCKED` (PR merged/closed, task `done`/`cancelled`): report and stop. +4. **Finish like any iteration.** Surface the verdict, run the merge gate, propagate, `TASK_END`. The run log records `RUN_START mode=rework`. -**Recovering after orchestrator compaction.** Infer the current phase from the task's Piyaz status alone. `draft` with no plan: researcher pending. `draft` with plan present, or `planned`: planner done, implementer pending. `in_progress`: implementer pending or partial-success recovery (see *Failure handling*). `in_review`: implementer done; reviewer pending or already returned. When the transcript shows no verdict yet, dispatch the reviewer. When the verdict is in the transcript, advance to propagation. `done`: HOTL approved; iteration complete; advance to propagation if propagation has not yet run. +## Pipelined research-ahead (flag-gated) -### The orchestrator does not write `status` +Only under `--pipelined`, only in backlog mode, lookahead 1. The win is latency (~15–25%), not tokens; when in doubt, run without it. -This is load-bearing and the most common way an orchestrator like composer goes wrong. **Every lifecycle transition belongs to a subagent**, never to the orchestrator: +- **Trigger:** after task A's workflow returns DONE, launch a research-only workflow for the next ready task B in the background (`resumeFrom='research'` with a research-only early return is not built in; instead dispatch `piyaz:composer-researcher` directly with worktree isolation and `run_in_background`). Never prefetch while A's workflow is still running. +- **Pick B excluding A.** B must be ready independently of A; `in_review` unblocks nothing, so the ready set already excludes A's dependents. +- **Brief custody:** when the prefetch returns, append a `BRIEF` event with the brief verbatim. The prefetch is not a `PICK`; B's `PICK` lands when B's iteration starts, so recovery's last-`PICK`-without-`TASK_END` rule still finds A. Pass the brief into B's workflow launch as `priorBrief` with `resumeFrom='plan'` only when the invalidation table below clears it. +- **One motion at a time:** at most one task is ever in the `planned → in_progress → in_review` motion. B is never planned, claimed, or implemented early. A prefetch failure consumes no budget; drop it and research B normally. -- `draft → planned`: **planner**, when it saves the `implementationPlan` in one atomic update. -- `planned → in_progress`: **implementer**, as its claim before any code is touched. -- `in_progress → in_review`: **implementer**, with the full Completion Protocol payload (`executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`) after the PR opens. -- `in_review → done`: **HOTL operator**, after PR approval/merge; never automatic and never written by composer or any subagent. The reviewer's verdict is advisory; HOTL still owns the transition. -- `* → cancelled`: never automatic; only triggered by an explicit user request, and even then routed through the appropriate subagent. +**Brief invalidation.** After propagation(A), evaluate in order; the first matching row wins: -The orchestrator's only Piyaz writes per iteration are **edge updates during propagation** (step 6), and even those are conditional on what propagation discovers. Picking a task does not claim it. Dispatching a researcher does not claim it. Dispatching a reviewer does not flip status. The implementer is the only writer of `status='in_progress'`; the HOTL operator is the only writer of `status='done'`. +| # | Signal after propagation(A) | Action | +| --- | --- | --- | +| 1 | A `depends_on` edge B→(non-done task) was created | Re-pick; brief is stale | +| 2 | B's description was updated | Re-research (relaunch fresh) | +| 3 | Edge notes into B name files/patterns in the brief's *Files to touch* | Re-research | +| 4 | A's files ∩ B brief's *Files to touch* ≠ ∅ | Re-research with the A PR pointer in `gateAnswers` | +| 5 | A re-pick returns C outranking B on priority class | Re-pick to C; a tie proceeds with B | +| 6 | Pure informational note updates, no overlap | Proceed with the brief | +| 7 | None of the above | Proceed | -Violating this rule (e.g., claiming `in_progress` at pick time so "no other agent grabs the task") looks innocuous but breaks the piyaz contract in three ways: it forces a `draft` task into `in_progress` without a plan, it puts the task in `in_progress` while a read-only researcher runs (misleading anyone watching the project), and it suppresses the planner's `_hints` that fire on the legitimate `draft → planned` transition. +**Kill switch:** after two consecutive invalidations, disable prefetch for the rest of the run and say so. + +## Dispatch hygiene + +The workflow builds every phase dispatch from the `args` you pass; the agents inherit nothing else. Keep `args` to the pick facts in *Step details* — never pass orchestrator transcript, prior-iteration summaries, full meta payloads, or piyaz reference text. The agents load their own rule extracts and fetch task context from Piyaz themselves. Oversized dispatches make agents worse, not better. ## Failure handling -A failed attempt is any of: implementer reports tests/lint/typecheck red and cannot self-recover; implementer returns without opening a PR; planner cannot save a plan after one retry; reviewer cannot reach `piyaz_context depth='review'` or the dispatch is premature (task not at `in_review`). A reviewer verdict of `request-changes` or `block` is NOT a failure; it is the correct outcome of a careful review and is surfaced to HOTL like any other verdict. On failure: +`BLOCKED`/null from the workflow is a failed attempt, with exceptions: -1. Do not write the failure to `decisions`. Per artifacts §1, `decisions` is CHOICE + WHY only; "attempt N failed" is process metadata and pollutes the field. Keep the failure summary in the orchestrator's own transcript (the user sees it directly there) and let the data layer's audit log carry the rest. -2. Leave the task at its current Piyaz status (do not auto-cancel; the task is not broken, the attempt was). -3. In backlog mode, move on to the next pick. In single-task mode, retry the iteration up to three total attempts (counting attempt 1). After three failures, emit `composer reports three consecutive failed attempts on ` to the transcript (matches the `/goal` clause) and exit. +- A phase that reports BLOCKED because the task is already `done` or `cancelled` is not a failure — HOTL resolved it underneath the run. Run *Surface + merge + propagate* if it has not run, consume no budget, move on. +- `BLOCKED — environmental: ` (gh auth, rate limits, network) is an environment problem; surface it verbatim, consume no budget, resume the same workflow (via `resumeFrom`) once the user confirms the fix. +- `BLOCKED` from the plan phase prefixed `foundation-unsound` means the planner judged the research foundation wrong; relaunch the workflow **fresh** once to re-research, then treat a second failure normally. - **Why the asymmetry.** Backlog mode optimizes throughput across the queue; a stubborn task should not block other ready work. The failed task stays at `in_progress` for human triage. Single-task mode optimizes completion of one named task; retries are warranted because there is nothing else to fall through to. +For every other BLOCKED: -Each retry dispatches the implementer fresh with the parent attempt's failure summary appended to the prompt; the researcher and planner are not re-run unless the failure clearly traces to a planning gap (e.g., the plan references a file that does not exist). +1. Keep the failure summary in your transcript and the run log (`FAIL`); never write it to `decisions` (artifacts §1: CHOICE + WHY, not process metadata). +2. Leave the task at its current status. Never roll back, never cancel. +3. Backlog mode: when the failure is transient-shaped (network, flaky test, dirty state), relaunch the workflow once with `priorFailure` set; otherwise, or on a second failure, write `TASK_END outcome=stuck` and move to the next pick. Single-task mode: relaunch up to three total attempts, appending each failure summary as `priorFailure`; after the third, report and stop. -**Partial success: PR opened, `in_review` not marked.** If a retry's pre-flight finds the task at `in_progress` with an open PR matching the branch name pattern (`/-`), do not re-implement. Resume the Completion Protocol: re-evaluate acceptance criteria against the PR diff, populate `executionRecord` / `decisions` / `files`, mark `in_review`. The PR is the load-bearing artifact; the missing status write is recoverable. This is a single-attempt recovery; if it fails, count it toward the failure budget per rule 3. +**Partial success and orphaned PRs** are handled inside the implementer's pre-flight (it resumes the Completion Protocol against an existing branch/PR rather than re-implementing). When a single-task pick is already `in_progress` or `in_review`, launch the workflow with `resumeFrom='implement'` (in_progress) or `resumeFrom='fix'` with the existing `prUrl` (in_review); the implementer's pre-flight does the rest. ## Stop conditions -The orchestrator emits one of these literal phrases to the transcript when the corresponding state holds. `/goal` matches against them and ends the session. +Stop and report in plain language (there are no magic stop phrases) when one holds: + +1. **Backlog drained**: `ready` and `plannable` are both empty. The stop report enumerates every task left at `in_progress`/`in_review` with its failure summary — nothing strands silently. +2. **Failure budget exhausted**: three failed attempts on the same task (single-task mode). +3. **User says stop**: exit after the in-flight write finishes. +4. **Single-task or rework iteration complete**: verdict surfaced, merge gate run, propagation done. +5. **Rewrite denied** (single-task mode): the user rejected a proposed rewrite at the gate. +6. **Piyaz transport/auth failure**: any Piyaz tool call fails with auth expiry, 401/403, a 5xx, or a network error. Stop immediately (not retryable in-session, resilience §10) and report the exact error plus the last completed phase per in-flight task. + +These six are exhaustive. Every stop appends `RUN_END` with its reason and the grep-derived counters, then offers to archive the log; the headless default is archive. + +## Recovering after compaction -1. `piyaz_analyze type='ready' returns an empty set`: backlog drained. -2. `composer reports three consecutive failed attempts on `: same task failed three times in single-task mode (or after the orchestrator manually retried in backlog mode). -3. The user typed `stop` at any prompt: exit immediately after the current in-flight write finishes. -4. (Single-task mode only) `composer reports the target task marked done`: emitted right after step 7 reaches the loop exit (propagation done, verdict surfaced). The literal phrase contains `done` for `/goal` matching; the task itself is at `in_review` awaiting HOTL approval, not actually at `done`. Composer's iteration is what ends, not the task's lifecycle. -5. (Single-task mode only) `composer reports proposed rewrite denied on `: emitted right after the user denies a substantive rewrite proposal in single-task mode (see *Proposed rewrites handling*). +Read the run log first: `.piyaz/composer-.md`. The last `PICK`/`WORKFLOW` without a matching `TASK_END` is the in-flight task. **Piyaz wins on status** — re-read the task row and never trust the log over the server. **The log wins on history** — the merge policy (`RUN_START mergePolicy=`), gate answers, verdict history, and the workflow `runId`. -Do not invent new stop phrases. The `/goal` condition the user pastes during bootstrap matches these five verbatim; any drift breaks the harness. +To resume the in-flight task: -## Reuse points from the piyaz skill +- A `WORKFLOW runId=` line with no `VERDICT`/`TASK_END` after it means the workflow may still be journaled. Resume it with `Workflow({ scriptPath, resumeFromRunId: '' })` — completed phases return from cache, only the unfinished phase re-runs. Stop the prior run first if it is somehow still live. +- No usable runId: fall back to the Piyaz status mapping and relaunch with the matching `resumeFrom`. `draft` without a plan → fresh; `planned` → `resumeFrom='implement'` (or iteration end for a plannable-only pick); `in_progress` → `resumeFrom='implement'` (the implementer pre-flight resumes partial work); `in_review` → `resumeFrom='fix'` with the PR URL; `done` → HOTL or the merge gate already resolved it, run propagation if no `PROPAGATED` line exists. -Composer is glue. It explicitly defers to the `piyaz` skill for: +Append a `RESUME` line, then continue. Rebuild the backlog skip set from this run's `TASK_END outcome=stuck`/`skipped` lines. When the log is missing (different machine, sandbox), fall back to the status mapping alone; single-task mode re-invoked per task remains the lowest-risk shape for runs likely to span compaction. -- **Task selection.** `piyaz_analyze type='ready'` ∩ `type='critical_path'`, ranked by priority then estimate (see `plugins/claude-code/skills/piyaz/SKILL.md` § *What should I work on?*). -- **Refinement.** If the researcher's brief identifies vague acceptance criteria or a thin description, the planner applies refinements via `piyaz_task action='update'` with append semantics (see § *Refine a task* in the piyaz SKILL.md). -- **Planning.** Phase 2 saves the unabridged `implementationPlan` and transitions `draft → planned` exactly as § *Plan a draft task* specifies. -- **Implementation.** Phase 3 follows § *Implement a task* and the Completion Protocol (lifecycle §2). PR template detection, bracket form, body structure, `gh pr create` syntax all defer there. Composer adds only a conventional-commit title prefix when the project uses that format, and a `/-` branch name; both live in `agents/composer-implementer.md`. -- **Review.** Phase 4 reads `piyaz_context depth='review'` and follows the five-lens persona in `agents/review.md`. The verdict shape (`approve` / `request-changes` / `block` plus per-lens prose, AC evaluation, plan-vs-diff drift, downstream impact) is the reviewer's contract with the orchestrator and with HOTL. -- **Propagation.** `piyaz_query type='edges'` then `piyaz_analyze type='downstream'` after every `in_review` transition (and every later `done`); update edge notes, retire stale edges. +## Red flags — never do these -If a flow exists in the piyaz skill, do not reinvent it inside a subagent. Cite the section by file path and anchor instead. +| Temptation | Reality | +| --- | --- | +| Write `status` "so no other agent grabs the task" | Every transition belongs to a phase agent: planner `draft→planned`; implementer `planned→in_progress→in_review` plus fix rotations. The orchestrator writes only propagation edges — and `done`, but **only** when the merge gate merged the PR under an authorizing merge policy. | +| Merge without the policy authorizing it, or merge a non-approve / non-green PR | The merge gate fires only on `approve` + green CI, only under `ask-each` (with a yes) or `auto-on-approve`. `never` means HOTL merges. | +| Dispatch a phase agent yourself instead of launching the workflow | The orchestrator never dispatches phase agents directly (rework intake is the one exception). The workflow owns research → review; the orchestrator owns the seams. | +| Skip research or planning to "get the claim in faster" | The phase order is fixed inside the workflow; the orchestrator cannot reorder it. | +| Split an oversize task yourself | Oversize routes to `piyaz:decompose-task`, and only after the user gate. | +| Treat a `request-changes` or `block` verdict as a failed attempt | A careful verdict is a successful review. The workflow's fix loop or HOTL owns the response; the failure budget is untouched. | +| Pause between tasks to ask "should I continue?" | Continuous execution. The six stop conditions are the only exits; gates fire only on `NEEDS_DECISION`. | +| Pad `args` with transcript, meta, or spec text | Pick facts only. Pollution makes agents worse. | ## What composer is not -- **Not a decomposer.** Oversize tasks route to `piyaz:decompose-task`. Composer asks first; never silently splits a task. -- **Not a refiner.** Composer's researcher proposes refinements via the brief; the planner applies them through the canonical `piyaz_task` update path. If the user wants pure refinement, they should run the `piyaz` skill directly. -- **Not the code reviewer itself.** Composer dispatches the `piyaz:review` subagent in Phase 4 to produce a structured verdict, but the orchestrator does not interpret the verdict beyond surfacing it. The PR is reviewed on GitHub like any other PR; the HOTL operator owns the final `in_review → done` transition outside composer's loop, regardless of whether the reviewer recommended `approve`, `request-changes`, or `block`. -- **Not a session-resilience layer.** Long runs that hit auto-compaction rely on `/goal` to bound the session and on `piyaz_query type='meta'` plus the per-task Piyaz status to re-acquire project state on resume; composer does not persist its own session file. The orchestrator's "current phase" is implicit, derived from transcript and task status; after compaction it reconstructs per the *Phase entry and exit conditions* table. For runs likely to span compaction, prefer single-task mode and re-invoke composer per task rather than running an unbounded backlog loop. See `skills/piyaz/references/resilience.md` for the broader resilience primitives. +Not a decomposer (oversize routes out). Not a hand-refiner (that is the piyaz skill, used directly). It IS, when the user authorizes it, the merge gate. The workflow is the execution engine; the run log and the workflow journal are the resilience primitives; per-task re-invocation remains the recommendation for very long runs. ## See also -- `plugins/claude-code/skills/piyaz/SKILL.md`: canonical Piyaz flows composer reuses. -- `skills/piyaz/references/conventions.md`: Iron Law of grounding (cite real code, real refs; never speculate). -- `skills/piyaz/references/artifacts.md`: title/description/AC quality (§1), tag dimensions (§2), categories (§4), oversize threshold (§5). -- `skills/piyaz/references/lifecycle.md`: status lifecycle (§1), Completion Protocol with PR template detection (§2), propagation (§3). -- `plugins/claude-code/agents/composer-researcher.md`, `composer-planner.md`, `composer-implementer.md`, `review.md`: the four subagent definitions composer dispatches. -- `plugins/claude-code/agents/decompose.md`: the oversize-delegation target. +- `skills/composer/workflows/compose-task.js`: the per-task pipeline the orchestrator launches. +- `skills/piyaz/SKILL.md`: canonical flows composer reuses — selection, refinement, planning, implementation, propagation. +- `agents/composer-researcher.md`, `agents/composer-planner.md`, `agents/composer-implementer.md`, `agents/review.md`: the four phase contracts and their structured returns. +- `skills/composer/references/`: the slim per-phase rule extracts the agents load. +- `agents/decompose-task.md`: the oversize-delegation target. diff --git a/plugins/claude-code/skills/composer/references/implementer-rules.md b/plugins/claude-code/skills/composer/references/implementer-rules.md new file mode 100644 index 00000000..19e47f71 --- /dev/null +++ b/plugins/claude-code/skills/composer/references/implementer-rules.md @@ -0,0 +1,247 @@ +# Implementer rules (composer Phase 3 extract) + +Slim extract of the canonical piyaz references for the composer +implementer. Mirrors: `skills/piyaz/references/conventions.md` §1, §2, +`skills/piyaz/references/lifecycle.md` §1 (Summary, `in_progress`, +`in_review`), §2 (entire Completion Protocol, 2.1–2.4), and +`skills/piyaz/references/artifacts.md` §1 (`executionRecord`, +`decisions`, `files`), §6. Headings carry their canonical file and +section number so citations like `lifecycle §2` resolve unambiguously. +When editing a mirrored section, edit BOTH files. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. + +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. +- `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. +- `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. + +When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. + +`decisions` come from the conversation and the work, not from artifact-mining. Never invent them. + +--- + +## conventions §2 — Tool descriptions and `_hints` are runtime instructions + +Every Piyaz tool injects two things into your context at use time: + +1. The tool's description and parameter schema, visible before the call. +2. A `_hints` array in the response, visible after the call. + +These are not optional commentary. They are server-side rules and state you cannot see otherwise. They override any prior plan you had. + +**Read on every tool call. Act before continuing.** + +Examples of hints you must obey: + +- Missing required fields on `done`: hint says `executionRecord is required`. Re-call with the field. +- Tool description says "REQUIRED in multi-team accounts". The server rejects ambiguous calls. +- Hint says "no ready tasks; try `piyaz_analyze type='plannable'`". Switch to plannable. Do not invent ready work. +- Hint says "edges to cancelled task remain in place". Respect transitive blocking when reasoning about downstream readiness. + +**Order rule when multiple hints fire.** When two or more `_hints` come back in the same response (e.g. "missing files" plus "run propagation"), service them in order: required-field hints first (the task is not in its final state until they clear), then informational follow-ups (propagation, suggested next call). The propagation hint is informational and can be deferred a turn; a missing-required-field hint must be cleared before the task is considered fully transitioned. + +Skipping a hint is operating on stale information. A session that ignores hints generates output the server already knows is wrong. + +--- + +## lifecycle §1 — Status lifecycle + +``` +draft → planned → in_progress → in_review → done + cancelled (terminal, reachable from any non-terminal) +``` + +### Summary + +| Status | Required fields | Forbidden fields | Trigger to leave | +|---|---|---|---| +| `draft` | `description`, `acceptanceCriteria` | `executionRecord`, `implementationPlan` | implementation plan saved → `planned` | +| `planned` | + `implementationPlan` (unabridged); all `depends_on` blockers `done` | `executionRecord` | someone claims via `action='update' status='in_progress'` → `in_progress` | +| `in_progress` | + active worker (one only) | — | work complete + record + ACs + Completion Protocol §2 run → `in_review` | +| `in_review` | + `executionRecord`, `decisions`, `files`, every AC evaluated, `prUrl` (optional sugar — when a PR was opened; backend upserts a `task_links` row with `kind='pull_request'`) | — | HOTL operator inspects PR and flips → `done` (or back to `in_progress` for rework) | +| `done` | (inherited from `in_review`) | — | terminal | +| `cancelled` | + `executionRecord` (rationale + what was tried), `decisions` | — | terminal | + +### `in_progress` + +- **What it means.** Active implementation. Exactly one engineer or agent is working on it. +- **Constraint:** should not span sessions. If work pauses, leave a note in the task or move it back to `planned`. +- **Transitions to `in_review`:** when implementation is complete, `executionRecord` / `decisions` / `files` are populated, acceptance criteria are evaluated, and the Completion Protocol (§2) has run. + +### `in_review` + +- **What it means.** Implementer subagent has finished the work, opened a PR, and populated the full Completion Protocol payload (`executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`). Tests, lint, and typecheck are green. Awaiting human review on the PR. +- **Cannot:** be self-promoted to `done` by any agent. The HOTL operator owns the `in_review → done` transition. +- **Transitions to `done`:** when the PR is approved/merged and the operator updates status. No additional payload is required; the implementer already populated everything. +- **Transitions back to `in_progress`:** when the reviewer requests rework. The implementer or a follow-up worker picks the task up again from `in_progress`. + +--- + +## lifecycle §2 — Completion Protocol + +Before transitioning a task to `in_review`, `done`, or `cancelled`: + +### 2.1. Detect mode by transcript + +- **Dispatched mode**: your context shows you were invoked via the Task tool by a parent agent. Mark `in_review` directly with the full payload (the implementer's terminal write); the HOTL operator finalizes to `done`. Return to the parent with the task ref and a one-sentence summary. Do not ask. +- **Direct mode**: invoked by the user in a normal session. Ask "Ready to mark this `in_review`?" with a one-sentence executionRecord preview. Wait for explicit confirmation; the HOTL operator finalizes to `done` after PR approval. +- **Uncertain**: default to asking. A spurious confirmation prompt is cheap; an unauthorized status change is expensive. + +### 2.2. Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. Re-call with the additions before continuing. + +For pure spec-review / docs / decision-only / Piyaz-only refinement tasks that touched no repo files, pass `files=[]` explicitly. Omitting the field leaves the prior value in place and the server's "missing files" hint will not clear. The empty array is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +### 2.3. Open a PR if the work changed code + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Piyaz-only refinement): + +**Detect a PR template** in the repo at one of these paths (or similar): + +- `.github/PULL_REQUEST_TEMPLATE.md` +- `.github/pull_request_template.md` +- `.github/PULL_REQUEST_TEMPLATE/.md` +- `docs/pull_request_template.md` + +**If a template exists**: fill it. Map task fields onto template sections only where they fit. Leave a section blank rather than invent content. Common mappings: + +- Linked issue / linked task: include the `taskRef` in `[BRACKETS]` (e.g. `[MYMR-83]`). Bracket form triggers Piyaz PR-status tracking; use it for the ONE primary task this PR builds. Reference any related tasks elsewhere as plain links (no brackets). Add `Closes #N` on its own line if a GitHub issue is being resolved. +- Summary section: 2 to 3 sentences from `executionRecord`. +- Test plan / verification section: the `acceptanceCriteria` items that are checked. +- Decisions or notes-for-reviewer section if present: relevant entries from `decisions`. + +**If no template exists**: use this concise default. + +```markdown +## Summary + +**Task Reference**: [MYMR-XXX] + + + + +## Type of change + +- [ ] Bug fix +- [ ] New feature +- [ ] Refactor / cleanup +- [ ] Documentation + +## Testing + +- [ ] Tested locally with `` +- [ ] Linting and formatting pass (``) +- [ ] Type or build check passes (``) + +## Notes for reviewer + + +``` + +Open the PR with `gh pr create --title '' --body "$(cat <<'EOF' ... EOF)"`. + +**Always concise.** Do not pad sections to look thorough. Empty optional sections beat fabricated content. If the template has prompt questions you cannot answer, skip them rather than make answers up. + +### 2.4. Skip the PR for these task types + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Piyaz refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (a Looker dashboard tweak applied via the Looker UI, a Tableau workbook published from Desktop, a metric definition signed off in a doc, an ad-hoc SQL analysis attached to a ticket, a BRD update in Confluence). In these cases the deliverable lives outside git; record the artifact link or path in `executionRecord` and `files` instead of opening a PR. When the data work IS in a git repo (a dbt project, a SQL repo, a notebook collection under version control), open a PR per the standard rules above. + +When in doubt, ask the user before opening. + +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +You write this field at the `in_review` transition; it is the core of your Completion Protocol payload. + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +Decisions come from the refinement, planning, or implementation conversation. When a choice is settled (by you against the codebase, or with the user), record it without being asked. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." +GOOD (agentic): "Use a per-thread tool registry. Two concurrent agent loops were stepping on each other's MCP client state." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. If a decision is not grounded in conversation, code, or the artifacts above, leave it out. + +### `files` + +- **Format:** plain repo-relative path strings. No backticks, no quoting. +- **Coverage:** every file created or modified for `done` tasks. +- **Empty `files=[]` is the correct value whenever paths cannot be cited:** pre-implementation tasks (`draft`, `planned`) where the code does not exist yet, research or decision-only tasks, Piyaz-only refinements. **Leave empty rather than speculate.** + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. +- Headings (`##`, `###`) only in long fields like `implementationPlan`. + +### Tone: never sound like AI + +The text you write into Piyaz is read by other engineers. It must read like an engineer wrote it, not a chatbot. + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. The rule is "no fluff", not "no length". diff --git a/plugins/claude-code/skills/composer/references/planner-rules.md b/plugins/claude-code/skills/composer/references/planner-rules.md new file mode 100644 index 00000000..a09eed54 --- /dev/null +++ b/plugins/claude-code/skills/composer/references/planner-rules.md @@ -0,0 +1,193 @@ +# Planner rules (composer Phase 2 extract) + +Slim extract of the canonical piyaz references for the composer planner. +Mirrors: `skills/piyaz/references/conventions.md` §1, +`skills/piyaz/references/artifacts.md` §1 (`description`, +`acceptanceCriteria`, `decisions`), §6, and +`skills/piyaz/references/lifecycle.md` §1 (Summary, `draft`, `planned`), +§2.2 (Completion Protocol payload fields). Headings carry their canonical +file and section number so citations like `lifecycle §2.2` resolve +unambiguously. When editing a mirrored section, edit BOTH files. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. + +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. +- `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. +- `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. + +When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. + +`decisions` are different (see §1 of the artifact rules below). They come from the conversation, not from artifact-mining. + +--- + +## lifecycle §1 — Status lifecycle + +``` +draft → planned → in_progress → in_review → done + cancelled (terminal, reachable from any non-terminal) +``` + +### Summary + +| Status | Required fields | Forbidden fields | Trigger to leave | +|---|---|---|---| +| `draft` | `description`, `acceptanceCriteria` | `executionRecord`, `implementationPlan` | implementation plan saved → `planned` | +| `planned` | + `implementationPlan` (unabridged); all `depends_on` blockers `done` | `executionRecord` | someone claims via `action='update' status='in_progress'` → `in_progress` | +| `in_progress` | + active worker (one only) | — | work complete + record + ACs + Completion Protocol §2 run → `in_review` | +| `in_review` | + `executionRecord`, `decisions`, `files`, every AC evaluated, `prUrl` (optional sugar — when a PR was opened; backend upserts a `task_links` row with `kind='pull_request'`) | — | HOTL operator inspects PR and flips → `done` (or back to `in_progress` for rework) | +| `done` | (inherited from `in_review`) | — | terminal | +| `cancelled` | + `executionRecord` (rationale + what was tried), `decisions` | — | terminal | + +### `draft` + +- **What it means.** Scope captured. The task is real but unbuilt. +- **Cannot:** be coded directly. Needs planning first. +- **Transitions to `planned`:** when an implementation plan is written and saved on the task. The plan must be unabridged. Do not save summaries. + +### `planned` + +- **What it means.** Implementation plan is written. All `depends_on` blockers are themselves `done`. Ready for someone to claim and code. +- **Transitions to `in_progress`:** when someone explicitly claims via `piyaz_task action='update' status='in_progress'`. Claim BEFORE starting work; this prevents two agents from grabbing the same task. + +--- + +## lifecycle §2.2 — Populate the required fields (Completion Protocol) + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. Re-call with the additions before continuing. + +For pure spec-review / docs / decision-only / Piyaz-only refinement tasks that touched no repo files, pass `files=[]` explicitly. Omitting the field leaves the prior value in place and the server's "missing files" hint will not clear. The empty array is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +(The planner pre-fills the plan's Completion Protocol template section against these field requirements; the implementer executes the full protocol from its own extract.) + +--- + +## artifacts §1 — Task artifact quality + +### `description` + +The first thing a coding agent or engineer reads when picking up a task. It must be enough on its own to start the work. Concise and clear. + +Cover, depending on task type: + +- **Feature**: what the capability does, who it serves, where it lives in the architecture. +- **Bug**: what is broken, when it manifests, why it matters, and the suspected root cause if known. +- **Refactor / improvement**: what changes, what stays the same, why it is worth doing now. +- **Research / investigation**: what the question is, why it needs answering, what a good answer looks like. +- **Chore / setup / docs**: what needs doing and why now. + +- **Solution sketch:** if you have one, include it. "Use Drizzle, mirror the patterns in `lib/data/task.ts`" is more useful than "Define the database tables". +- **No speculation:** do not pad with implementation guesses when the approach is uncertain. The implementation plan is for that. + +Length: 2 to 4 sentences for most tasks. Up to 6 to 8 sentences for genuinely complex tasks. Single-sentence descriptions are rejected. + +``` +GOOD (feature, web SaaS): +"Build the habit completion endpoint at POST /api/habits/:id/complete. Inserts +into habit_logs with the user's timezone-adjusted date. Returns the updated +streak count. Idempotent on (habit_id, log_date): duplicate calls return the +existing log. Used by both the web dashboard and the iOS widget." + +GOOD (bug, simulation engine): +"Fix Queue::front returning a copy instead of a reference. Spec §4.2.4.2 +requires the head pointer to be modifiable in-place so Airport::moveToRunway +can swap it out without a re-insert. Currently caught by a unit test on +takeoff_flow. Likely a one-line change in include/Queue.h." + +BAD: "Improve the database." +BAD: "Make auth better." +BAD: "Fix the bug in queue." +BAD: "Build the dashboard." +``` + +### `acceptanceCriteria` + +2 to 4 items. Each criterion must be **binary**: a reviewer can answer YES or NO without ambiguity. + +``` +GOOD: +- "Running bun run db:push creates all tables without errors" +- "User table has id, email, name, passwordHash, createdAt columns" +- "FK from tasks.projectId to projects.id with ON DELETE CASCADE" +- "Seed script creates 3 test users and 2 projects with tasks" + +GOOD (firmware): +- "spi_send returns within 50µs at 80MHz clock measured on logic analyzer" +- "DMA TX completion fires interrupt; no busy-loop in the driver" +- "spi_recv returns 0xFF when MISO is held high, verified on the bench" + +BAD: +- "Database works" +- "All tables created" +- "Tests pass" +- "Performance is good" +``` + +Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +Decisions come from the refinement, planning, or implementation conversation. When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." +GOOD (agentic): "Use a per-thread tool registry. Two concurrent agent loops were stepping on each other's MCP client state." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. If a decision is not grounded in conversation, code, or the artifacts above, leave it out. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. +- Headings (`##`, `###`) only in long fields like `implementationPlan`. + +### Tone: never sound like AI + +The text you write into Piyaz is read by other engineers. It must read like an engineer wrote it, not a chatbot. + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/claude-code/skills/composer/references/researcher-rules.md b/plugins/claude-code/skills/composer/references/researcher-rules.md new file mode 100644 index 00000000..4415f961 --- /dev/null +++ b/plugins/claude-code/skills/composer/references/researcher-rules.md @@ -0,0 +1,210 @@ +# Researcher rules (composer Phase 1 extract) + +Slim extract of the canonical piyaz references for the composer researcher. +Mirrors: `skills/piyaz/references/conventions.md` §1, §4 and +`skills/piyaz/references/artifacts.md` §1 (`description`, +`acceptanceCriteria`, `decisions`), §2, §5, §6. Headings carry their +canonical file and section number so citations like `conventions §1` +resolve unambiguously. When editing a mirrored section, edit BOTH files. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. + +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. +- `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. +- `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. + +When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. + +`decisions` are different (see §1 of the artifact rules below). They come from the conversation, not from artifact-mining. + +--- + +## conventions §4 — taskRef format + +Tool responses include a `taskRef` like `MYMR-83`: uppercase project prefix, dash, integer. Use the ref in user-facing output. **Always pass the UUID `taskId` to tool calls. Never the ref.** + +--- + +## artifacts §1 — Task artifact quality + +### `description` + +The first thing a coding agent or engineer reads when picking up a task. It must be enough on its own to start the work. Concise and clear. + +Cover, depending on task type: + +- **Feature**: what the capability does, who it serves, where it lives in the architecture. +- **Bug**: what is broken, when it manifests, why it matters, and the suspected root cause if known. +- **Refactor / improvement**: what changes, what stays the same, why it is worth doing now. +- **Research / investigation**: what the question is, why it needs answering, what a good answer looks like. +- **Chore / setup / docs**: what needs doing and why now. + +- **Solution sketch:** if you have one, include it. "Use Drizzle, mirror the patterns in `lib/data/task.ts`" is more useful than "Define the database tables". +- **No speculation:** do not pad with implementation guesses when the approach is uncertain. The implementation plan is for that. + +Length: 2 to 4 sentences for most tasks. Up to 6 to 8 sentences for genuinely complex tasks. Single-sentence descriptions are rejected. + +``` +GOOD (feature, web SaaS): +"Build the habit completion endpoint at POST /api/habits/:id/complete. Inserts +into habit_logs with the user's timezone-adjusted date. Returns the updated +streak count. Idempotent on (habit_id, log_date): duplicate calls return the +existing log. Used by both the web dashboard and the iOS widget." + +GOOD (bug, simulation engine): +"Fix Queue::front returning a copy instead of a reference. Spec §4.2.4.2 +requires the head pointer to be modifiable in-place so Airport::moveToRunway +can swap it out without a re-insert. Currently caught by a unit test on +takeoff_flow. Likely a one-line change in include/Queue.h." + +GOOD (research, ML platform): +"Investigate whether torch.compile improves training throughput on the +ResNet-50 baseline. Question: does compile-time speedup outweigh JIT overhead +on our 8-GPU pod? A good answer is a benchmark script plus a one-paragraph +recommendation comparing wall-clock per epoch and peak memory." + +BAD: "Improve the database." +BAD: "Make auth better." +BAD: "Fix the bug in queue." +BAD: "Build the dashboard." +``` + +### `acceptanceCriteria` + +2 to 4 items. Each criterion must be **binary**: a reviewer can answer YES or NO without ambiguity. + +``` +GOOD: +- "Running bun run db:push creates all tables without errors" +- "User table has id, email, name, passwordHash, createdAt columns" +- "FK from tasks.projectId to projects.id with ON DELETE CASCADE" +- "Seed script creates 3 test users and 2 projects with tasks" + +GOOD (firmware): +- "spi_send returns within 50µs at 80MHz clock measured on logic analyzer" +- "DMA TX completion fires interrupt; no busy-loop in the driver" +- "spi_recv returns 0xFF when MISO is held high, verified on the bench" + +BAD: +- "Database works" +- "All tables created" +- "Tests pass" +- "Performance is good" +``` + +Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +Decisions come from the refinement, planning, or implementation conversation. When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." +GOOD (agentic): "Use a per-thread tool registry. Two concurrent agent loops were stepping on each other's MCP client state." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. If a decision is not grounded in conversation, code, or the artifacts above, leave it out. + +--- + +## artifacts §2 — Tag dimensions and first-class fields + +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='meta'` before coining new ones. + +| Dimension | Count | Vocabulary | +|---|---|---| +| **Work type** | exactly 1 | `bug`, `feature`, `refactor`, `docs`, `test`, `chore`, `perf` | +| **Cross-cutting concern** | ≥1 | quality attribute (`security`, `a11y`, `dx`, `perf`, `reliability`, `observability`, `i18n`, `compliance`, `safety`) or feature cluster spanning multiple categories (web: `onboarding-flow`, `live-replay`; aerospace: `flight-control`, `mission-planning`; agentic: `agent-loop`, `eval-harness`; ML: `inference-pipeline`, `data-drift`; financial: `risk-engine`, `pricing-model`) | +| **Tech** | at most 2 | most important stack pieces the task touches; pull from manifest deps | + +### First-class fields (priority, estimate, assignees) + +These are top-level columns on every task, set via `piyaz_task` parameters of the same name. They are NOT tags. + +- **`priority`** (one of `urgent`, `core`, `normal`, `backlog`). Required-on-create-by-convention: pick deliberately. Defaults: onboarding (shipped features) lands at `core`; decompose picks per task and avoids `core` everywhere or `urgent` everywhere (the dimension carries no signal then). A 30-task project usually has 3 to 6 `urgent` tasks and the rest split between `core`, `normal`, and `backlog`. +- **`estimate`** (Fibonacci story points: `1`, `2`, `3`, `5`, `8`, `13`). Optional. `1` is trivial, `2` and `3` are routine, `5` is nontrivial, `8` and `13` are risky or multi-day. If a task feels larger than `13`, split it (§5). +- **`assigneeIds`** (array of team-member user UUIDs). Optional. Declares ownership / intent, not concurrent execution; the single-worker `in_progress` invariant still holds. Each id must be a member of the project's owning team (the server rejects non-members at write time). + +**Do NOT tag:** + +- Priority: that is the `priority` field's job. Setting `urgent`, `core`, `normal`, or `backlog` as tags duplicates the field and adds no signal. +- Codebase area: that's `category`'s job. **Test: would this name plausibly be a category in some other project shape?** `render-loop`, `effect-system`, `auth`, `payments`, `inference`, `marts`, `flight-control`, `hal-drivers` all answer YES. They're subsystems / product areas, even if your project's category list happens to omit them. Tags are axes the project does not shape itself around: quality attributes (`security`, `a11y`, `perf`, `reliability`, `observability`, `dx`, `compliance`, `safety`, `i18n`) and multi-category feature clusters (`onboarding-flow`, `agent-loop`, `mission-planning`, `live-replay`). If a candidate tag names a subsystem, surface it as a category proposal at the gate or use the existing category. Coining an area-shaped tag because the categories lack a good slot is a category-list bug, not a tag. +- Task status: that is `status`'s job. +- Generic adjectives like "important", "main", "primary". + +**Honoring user-specified tags:** if the user explicitly tagged something, preserve their tags. Add the missing dimensions if any of the three are absent. + +**Tech tag examples by domain:** + +- Web: `react`, `next`, `drizzle`, `postgres`, `tailwind` +- Embedded: `c`, `rust`, `freertos`, `stm32-hal`, `zephyr` +- Data / ML: `sql`, `dbt`, `pytorch`, `clickhouse`, `airflow` + +Pull tech tags from the project's actual stack. Do not invent. + +--- + +## artifacts §5 — Granularity + +**1 to 4 hours per task.** A coding agent should complete one in a single session. + +Too small (under 30 minutes): overhead exceeds work. +Too large (over 1 day): hidden subtasks, unclear scope, hard to track. + +When in doubt, split. Tasks become more useful, and more parallelizable, as they shrink toward the 1-hour mark. Splitting is the decompose agent's job; the researcher's part is raising `oversize-task` when the true scope exceeds what `13` represents. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. +- Headings (`##`, `###`) only in long fields like `implementationPlan`. + +### Tone: never sound like AI + +The text you write into Piyaz is read by other engineers. It must read like an engineer wrote it, not a chatbot. + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/claude-code/skills/composer/references/reviewer-rules.md b/plugins/claude-code/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..f4ba311d --- /dev/null +++ b/plugins/claude-code/skills/composer/references/reviewer-rules.md @@ -0,0 +1,144 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical piyaz references for the review agent. +Mirrors: `skills/piyaz/references/conventions.md` §1, +`skills/piyaz/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and +`skills/piyaz/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. §2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## lifecycle §2.2 — Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Piyaz-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Piyaz-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Piyaz PR-status tracking. Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. + +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Piyaz refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + +--- + +## lifecycle §3 — Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Piyaz's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `piyaz_query type='edges'` on the changed task. Current relationships. +2. `piyaz_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. + +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". diff --git a/plugins/claude-code/skills/composer/references/sources.json b/plugins/claude-code/skills/composer/references/sources.json new file mode 100644 index 00000000..1f351974 --- /dev/null +++ b/plugins/claude-code/skills/composer/references/sources.json @@ -0,0 +1,8 @@ +{ + "_comment": "Canonical-source hash pins for the composer phase extracts in this directory. The extracts hand-mirror sections of these files; scripts/check-plugins.ts fails CI when a pinned file changes, until the extracts are reviewed and the pin refreshed via `bun run sync:plugins`.", + "pins": { + "plugins/claude-code/skills/piyaz/references/conventions.md": "a5488e798e21f2e7b8b4ea3eb09f95c4c8323bb368c86811920ee5bf1826c994", + "plugins/claude-code/skills/piyaz/references/artifacts.md": "f4a873ca56cf6e40bf469a552009cdae2b43133b63c4800d3c3ea3118cff69ca", + "plugins/claude-code/skills/piyaz/references/lifecycle.md": "22da7d66ba174ef621a41ff9ca7c9b401dfc4e29d472f13e0823633387a209da" + } +} diff --git a/plugins/claude-code/skills/composer/workflows/compose-task.js b/plugins/claude-code/skills/composer/workflows/compose-task.js new file mode 100644 index 00000000..45e7551a --- /dev/null +++ b/plugins/claude-code/skills/composer/workflows/compose-task.js @@ -0,0 +1,420 @@ +/** + * compose-task — the composer per-task pipeline. + * + * Launched once per task by the composer orchestrator (skills/composer/SKILL.md) + * via Workflow({ scriptPath, args }). Runs research → plan → implement → CI → + * review → bounded fix loop entirely off the orchestrator's context, dispatching + * the existing composer phase agents by agentType with per-phase model/effort and + * worktree isolation on the implementer. Returns one structured result; the + * orchestrator owns the interactive seams (gates, merge, propagation). + * + * Args (orchestrator → workflow): + * taskRef, taskId, projectId, categories, tagVocabulary, + * pickEstimate, pickPriority, workType, tags, thinDescription, + * mode, plannableOnly, resumeFrom, priorBrief, gateAnswers, fixFindings, + * prUrl, priorFailure, estimate, flags + * + * Return shapes: + * { status:'DONE', outcome:'in_review'|'planned', verdict, prUrl, ciState, + * acSatisfied, acTotal, rotations, escalated, blockingFindings, concerns } + * { status:'NEEDS_DECISION', phase, gate, brief } + * { status:'BLOCKED', phase, reason } + */ + +export const meta = { + name: "compose-task", + description: + "Run one Piyaz task through research, plan, implement, CI gate, review, and a bounded fix loop until the PR is ready", + phases: [ + { title: "Research" }, + { title: "Plan" }, + { title: "Implement" }, + { title: "CI gate" }, + { title: "Review" }, + ], +}; + +const RESEARCH_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["status", "brief", "confidence", "estimate", "workType", "flags", "proposedRewrites", "openQuestions", "reason"], + properties: { + status: { enum: ["DONE", "DONE_WITH_CONCERNS", "NEEDS_DECISION", "BLOCKED"] }, + brief: { type: "string", description: "The full markdown research brief, verbatim." }, + confidence: { type: "number" }, + estimate: { type: ["integer", "null"], description: "Refined Fibonacci estimate (1,2,3,5,8,13) or null." }, + workType: { type: ["string", "null"], description: "feat|fix|refactor|docs|test|chore|perf." }, + flags: { type: "array", items: { type: "string" } }, + proposedRewrites: { + type: "array", + items: { + type: "object", + additionalProperties: false, + required: ["field", "proposed", "rationale"], + properties: { + field: { type: "string" }, + proposed: { type: "string" }, + rationale: { type: "string" }, + }, + }, + }, + openQuestions: { type: "array", items: { type: "string" } }, + reason: { type: "string", description: "One-line STATUS reason." }, + }, +}; + +const PLAN_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["status", "sections", "buildSteps", "openQuestions", "reason"], + properties: { + status: { enum: ["DONE", "DONE_WITH_CONCERNS", "NEEDS_DECISION", "BLOCKED"] }, + sections: { type: "integer" }, + buildSteps: { type: "integer" }, + openQuestions: { type: "array", items: { type: "string" } }, + reason: { type: "string" }, + }, +}; + +const IMPL_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["status", "prUrl", "acSatisfied", "acTotal", "concerns", "reason"], + properties: { + status: { enum: ["DONE", "DONE_WITH_CONCERNS", "BLOCKED"] }, + prUrl: { type: ["string", "null"] }, + branch: { type: ["string", "null"] }, + acSatisfied: { type: "integer" }, + acTotal: { type: "integer" }, + concerns: { type: "array", items: { type: "string" } }, + reason: { type: "string" }, + }, +}; + +const CI_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["state", "failingChecks"], + properties: { + state: { enum: ["green", "red", "pending", "none"] }, + failingChecks: { type: "array", items: { type: "string" } }, + }, +}; + +const VERDICT_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["verdict", "blockingFindings", "concerns"], + properties: { + verdict: { enum: ["approve", "request-changes", "block"] }, + blockingFindings: { + type: "array", + items: { + type: "object", + additionalProperties: false, + required: ["finding"], + properties: { + file: { type: ["string", "null"] }, + line: { type: ["integer", "null"] }, + finding: { type: "string" }, + }, + }, + }, + concerns: { type: "array", items: { type: "string" } }, + }, +}; + +/** + * Resolves the workflow args, tolerating both an object and a JSON string. + * The harness passes `args` verbatim; some serialization paths deliver it as a + * JSON-encoded string, which would otherwise leave every field undefined. + * @param {unknown} raw - The global `args` value. + * @returns {object} The args object. + */ +function resolveArgs(raw) { + if (raw && typeof raw === "object") return raw; + if (typeof raw === "string" && raw.trim()) return JSON.parse(raw); + return {}; +} + +const a = resolveArgs(args); +const PHASE_ORDER = ["research", "plan", "implement", "fix"]; +const RISK_TAGS = ["security", "safety", "compliance"]; +const RISK_FLAGS = ["security-boundary-uncovered", "version-drift-major", "dep-mismatch"]; + +/** + * Reports whether a phase should run given the resume point. + * @param {string} phaseName - One of PHASE_ORDER. + * @returns {boolean} True when phaseName is at or after the resume point. + */ +function shouldRun(phaseName) { + const from = a.resumeFrom || "research"; + return PHASE_ORDER.indexOf(phaseName) >= PHASE_ORDER.indexOf(from); +} + +/** + * Reports whether any tag in a list is risk-bearing. + * @param {string[]} tags - Tag list. + * @returns {boolean} True when a security/safety/compliance tag is present. + */ +function hasRiskTag(tags) { + return (tags || []).some((t) => RISK_TAGS.includes(t)); +} + +/** + * Reports whether the implementer/planner must be forced to opus. + * @param {number|null} est - Refined estimate. + * @param {string[]} flags - Research flags. + * @returns {boolean} True when a guardrail forces the smartest tier. + */ +function forceOpus(est, flags) { + const riskFlag = (flags || []).some((f) => RISK_FLAGS.includes(f)); + return ( + hasRiskTag(a.tags) || + est == null || + est >= 8 || + a.priorFailure != null || + a.pickPriority === "urgent" || + riskFlag + ); +} + +/** + * Selects the research model from pick-time facts. Research correctness is + * load-bearing, so haiku is reserved for trivial, unambiguous work only. + * @returns {string} Model alias. + */ +function researchModel() { + const e = a.pickEstimate; + if (hasRiskTag(a.tags) || a.thinDescription || (e != null && e >= 5)) return "opus"; + return "sonnet"; +} + +/** + * Selects the implementer model from the refined estimate and work type. + * @param {number|null} est - Refined estimate. + * @param {string|null} wt - Work type. + * @param {string[]} flags - Research flags. + * @returns {string} Model alias. + */ +function implementModel(est, wt, flags) { + if (forceOpus(est, flags)) return "opus"; + if ((est != null && est <= 2) || ["docs", "test", "chore"].includes(wt)) return "sonnet"; + return "opus"; +} + +/** + * Builds a NEEDS_DECISION return for an orchestrator gate. + * @param {string} phase - Raising phase. + * @param {object} result - The phase's structured result. + * @param {string} [briefText] - Brief to carry through (plan gate). + * @returns {object} Gate result. + */ +function gateResult(phase, result, briefText) { + return { + status: "NEEDS_DECISION", + phase, + taskRef: a.taskRef, + gate: { + flags: result.flags || [], + proposedRewrites: result.proposedRewrites || [], + openQuestions: result.openQuestions || [], + confidence: result.confidence, + reason: result.reason, + }, + brief: briefText || result.brief, + }; +} + +/** + * Builds a BLOCKED return. + * @param {string} phase - Failing phase. + * @param {string} reason - One-line reason. + * @returns {object} Blocked result. + */ +function blockedResult(phase, reason) { + return { status: "BLOCKED", phase, taskRef: a.taskRef, reason: reason || "no reason reported" }; +} + +/** + * Formats review blocking findings into a fix-dispatch bullet list. + * @param {Array<{file?:string,line?:number,finding:string}>} findings - Findings. + * @returns {string} Newline-joined bullets. + */ +function formatFindings(findings) { + return (findings || []) + .map((f) => `- ${f.file ? `${f.file}${f.line ? `:${f.line}` : ""}: ` : ""}${f.finding}`) + .join("\n"); +} + +const head = `Target task: ${a.taskRef} (taskId ${a.taskId}) in project ${a.projectId}. Pass that projectId on every Piyaz tool call.`; + +// --- Research --------------------------------------------------------------- +phase("Research"); +let brief = a.priorBrief; +let research = null; +if (shouldRun("research")) { + const prompt = + `${head}\nProject categories and tags: ${a.categories}; ${a.tagVocabulary}.` + + (a.gateAnswers ? `\nOpen questions resolved by the user:\n${a.gateAnswers}` : ""); + research = await agent(prompt, { + agentType: "piyaz:composer-researcher", + model: researchModel(), + effort: "medium", + schema: RESEARCH_SCHEMA, + label: `research:${a.taskRef}`, + phase: "Research", + }); + if (!research) return blockedResult("research", "researcher returned no result"); + brief = research.brief; + if (research.status === "NEEDS_DECISION") return gateResult("research", research); + if (research.status === "BLOCKED") return blockedResult("research", research.reason); +} + +const est = research ? research.estimate : (a.estimate != null ? a.estimate : a.pickEstimate); +const wt = research ? research.workType : a.workType; +const flags = research ? research.flags : a.flags || []; + +// --- Plan ------------------------------------------------------------------- +phase("Plan"); +if (shouldRun("plan")) { + const entryStatus = a.plannableOnly ? "draft" : a.mode === "single" ? "unknown" : "draft|planned"; + const prompt = + `${head}\nEntry status: ${entryStatus}.\nResearch brief:\n${brief}` + + (a.gateAnswers ? `\nOpen questions resolved by the user:\n${a.gateAnswers}` : ""); + const plan = await agent(prompt, { + agentType: "piyaz:composer-planner", + model: "opus", + effort: est == null || est >= 8 || hasRiskTag(a.tags) ? "xhigh" : "high", + schema: PLAN_SCHEMA, + label: `plan:${a.taskRef}`, + phase: "Plan", + }); + if (!plan) return blockedResult("plan", "planner returned no result"); + if (plan.status === "NEEDS_DECISION") return gateResult("plan", plan, brief); + if (plan.status === "BLOCKED") return blockedResult("plan", plan.reason); +} + +if (a.plannableOnly) { + return { + status: "DONE", + phase: "plan", + outcome: "planned", + taskRef: a.taskRef, + reason: "plannable-only pick planned; dependencies unfinished", + }; +} + +// --- Implement -------------------------------------------------------------- +phase("Implement"); +let prUrl = a.prUrl; +let acSatisfied = null; +let acTotal = null; +let concerns = []; +if (shouldRun("implement")) { + const prompt = + `${head} Plan is saved to Piyaz; fetch via piyaz_context depth='agent'. ` + + "Claim the task, implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol." + + (a.priorFailure ? `\nPrior failed attempt:\n${a.priorFailure}` : ""); + const impl = await agent(prompt, { + agentType: "piyaz:composer-implementer", + model: implementModel(est, wt, flags), + effort: forceOpus(est, flags) || (est != null && est >= 5) ? "high" : "medium", + isolation: "worktree", + schema: IMPL_SCHEMA, + label: `implement:${a.taskRef}`, + phase: "Implement", + }); + if (!impl) return blockedResult("implement", "implementer returned no result"); + if (impl.status === "BLOCKED") return blockedResult("implement", impl.reason); + prUrl = impl.prUrl || prUrl; + acSatisfied = impl.acSatisfied; + acTotal = impl.acTotal; + concerns = impl.concerns || []; +} + +// --- CI gate → Review → bounded fix loop ------------------------------------ +// A rework launch (resumeFrom='fix' with human findings) seeds the loop so the +// first rotation addresses those findings before any fresh review runs; the +// human already reviewed. Every other entry starts with a CI gate and review. +let rotations = 0; +let lastReview = null; +let ciState = "unknown"; +let pendingFindings = a.resumeFrom === "fix" && a.fixFindings ? a.fixFindings : null; + +while (true) { + if (pendingFindings == null) { + phase("CI gate"); + const ci = await agent( + `Watch CI for pull request ${prUrl} and report status. Run exactly:\n` + + `timeout 600 gh pr checks ${prUrl} --watch; echo "exit=$?"\n` + + "Interpret the exit code: 0 means green; 8 or 124 means pending (checks still running or the watch timed out); any other non-zero means red, UNLESS the output says no checks are reported, which is none. " + + "On red, read the failing check names from the output. Do not edit any files; only report.", + { model: "haiku", effort: "low", schema: CI_SCHEMA, label: `ci:${a.taskRef}`, phase: "CI gate" }, + ); + ciState = ci ? ci.state : "pending"; + const failing = ci && ci.failingChecks ? ci.failingChecks.join(", ") : ""; + + phase("Review"); + const ciNote = + ciState === "red" + ? ` CI: failing (${failing})` + : ciState === "pending" + ? " CI: unresolved after 10m" + : ""; + lastReview = await agent(`${head} PR URL: ${prUrl}. Mode: composer-phase-4.${ciNote}`, { + agentType: "piyaz:review", + model: "opus", + effort: "high", + schema: VERDICT_SCHEMA, + label: `review:${a.taskRef}`, + phase: "Review", + }); + if (!lastReview) return blockedResult("review", "reviewer returned no result"); + + if (lastReview.verdict === "approve") break; + if (lastReview.verdict === "block" || rotations >= 2) break; + pendingFindings = formatFindings(lastReview.blockingFindings); + } + + rotations++; + log(`fix rotation ${rotations}/2 on ${a.taskRef}`); + phase("Implement"); + const fix = await agent( + `${head} Fix mode. PR: ${prUrl}. Address exactly these review findings, re-run verification, re-mark in_review:\n${pendingFindings}`, + { + agentType: "piyaz:composer-implementer", + model: "opus", + effort: "high", + isolation: "worktree", + schema: IMPL_SCHEMA, + label: `fix:${a.taskRef}#${rotations}`, + phase: "Implement", + }, + ); + if (!fix) return blockedResult("fix", "fix implementer returned no result"); + if (fix.status === "BLOCKED") return blockedResult("fix", fix.reason); + prUrl = fix.prUrl || prUrl; + if (fix.acSatisfied != null) acSatisfied = fix.acSatisfied; + if (fix.acTotal != null) acTotal = fix.acTotal; + pendingFindings = null; +} + +const escalated = + lastReview.verdict === "block" || (lastReview.verdict === "request-changes" && rotations >= 2); + +return { + status: "DONE", + phase: "review", + outcome: "in_review", + taskRef: a.taskRef, + verdict: lastReview.verdict, + prUrl, + ciState, + acSatisfied, + acTotal, + rotations, + escalated, + blockingFindings: lastReview.verdict === "approve" ? [] : lastReview.blockingFindings || [], + concerns, +}; diff --git a/plugins/claude-code/skills/piyaz/SKILL.md b/plugins/claude-code/skills/piyaz/SKILL.md index 6b74ab66..2370428d 100644 --- a/plugins/claude-code/skills/piyaz/SKILL.md +++ b/plugins/claude-code/skills/piyaz/SKILL.md @@ -150,7 +150,7 @@ You handle most Piyaz interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`piyaz:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose HGT-17", composer's oversize handler) | Dispatch **`piyaz:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`piyaz:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode) or **`/piyaz:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode), **`/piyaz:composer `** (single-task mode), or **`/piyaz:composer rework `** (round GitHub review feedback back through the fix loop). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review LNS-12", "review this PR", "review ``", "what does the review subagent think of LNS-12") | Dispatch **`piyaz:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `piyaz:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`piyaz:manage`** for deep CTO mode | @@ -189,7 +189,7 @@ Lead with slim tools. - `piyaz_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. When HOTL requests changes on a composer PR instead of merging, `/piyaz:composer rework ` rounds that feedback back through the fix loop. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/claude-code/skills/piyaz/references/artifacts.md b/plugins/claude-code/skills/piyaz/references/artifacts.md index 3990a5f7..8c6b87fc 100644 --- a/plugins/claude-code/skills/piyaz/references/artifacts.md +++ b/plugins/claude-code/skills/piyaz/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Piyaz: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Task artifact quality: title, description, acceptanceCriteria, executionRecord, decisions, files @@ -140,7 +142,7 @@ BAD: Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -186,7 +188,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='meta'` before coining new ones. | Dimension | Count | Vocabulary | |---|---|---| @@ -409,29 +411,6 @@ The text you write into Piyaz is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/claude-code/skills/piyaz/references/conventions.md b/plugins/claude-code/skills/piyaz/references/conventions.md index 45b6c6f3..5fc28f18 100644 --- a/plugins/claude-code/skills/piyaz/references/conventions.md +++ b/plugins/claude-code/skills/piyaz/references/conventions.md @@ -6,6 +6,8 @@ Piyaz runs across every kind of software and data project: web and SaaS apps, mo Every Piyaz skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + --- ## How this is split diff --git a/plugins/claude-code/skills/piyaz/references/lifecycle.md b/plugins/claude-code/skills/piyaz/references/lifecycle.md index 159b7ada..bf5139ad 100644 --- a/plugins/claude-code/skills/piyaz/references/lifecycle.md +++ b/plugins/claude-code/skills/piyaz/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Status lifecycle: what each state means, requires, and forbids diff --git a/plugins/codex/.codex-plugin/plugin.json b/plugins/codex/.codex-plugin/plugin.json index 49d6e166..7ab59aa2 100644 --- a/plugins/codex/.codex-plugin/plugin.json +++ b/plugins/codex/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "piyaz", - "version": "0.1.0", + "version": "0.1.1", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions.", "author": { "name": "Piyaz", diff --git a/plugins/codex/skills/composer/references/reviewer-rules.md b/plugins/codex/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..f4ba311d --- /dev/null +++ b/plugins/codex/skills/composer/references/reviewer-rules.md @@ -0,0 +1,144 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical piyaz references for the review agent. +Mirrors: `skills/piyaz/references/conventions.md` §1, +`skills/piyaz/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and +`skills/piyaz/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. §2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## lifecycle §2.2 — Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Piyaz-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Piyaz-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Piyaz PR-status tracking. Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. + +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Piyaz refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + +--- + +## lifecycle §3 — Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Piyaz's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `piyaz_query type='edges'` on the changed task. Current relationships. +2. `piyaz_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. + +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". diff --git a/plugins/codex/skills/piyaz/SKILL.md b/plugins/codex/skills/piyaz/SKILL.md index 6b74ab66..2370428d 100644 --- a/plugins/codex/skills/piyaz/SKILL.md +++ b/plugins/codex/skills/piyaz/SKILL.md @@ -150,7 +150,7 @@ You handle most Piyaz interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`piyaz:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose HGT-17", composer's oversize handler) | Dispatch **`piyaz:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`piyaz:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode) or **`/piyaz:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode), **`/piyaz:composer `** (single-task mode), or **`/piyaz:composer rework `** (round GitHub review feedback back through the fix loop). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review LNS-12", "review this PR", "review ``", "what does the review subagent think of LNS-12") | Dispatch **`piyaz:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `piyaz:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`piyaz:manage`** for deep CTO mode | @@ -189,7 +189,7 @@ Lead with slim tools. - `piyaz_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. When HOTL requests changes on a composer PR instead of merging, `/piyaz:composer rework ` rounds that feedback back through the fix loop. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/codex/skills/piyaz/references/artifacts.md b/plugins/codex/skills/piyaz/references/artifacts.md index 3990a5f7..8c6b87fc 100644 --- a/plugins/codex/skills/piyaz/references/artifacts.md +++ b/plugins/codex/skills/piyaz/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Piyaz: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Task artifact quality: title, description, acceptanceCriteria, executionRecord, decisions, files @@ -140,7 +142,7 @@ BAD: Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -186,7 +188,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='meta'` before coining new ones. | Dimension | Count | Vocabulary | |---|---|---| @@ -409,29 +411,6 @@ The text you write into Piyaz is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/codex/skills/piyaz/references/conventions.md b/plugins/codex/skills/piyaz/references/conventions.md index ffbd3ecb..3e446e3a 100644 --- a/plugins/codex/skills/piyaz/references/conventions.md +++ b/plugins/codex/skills/piyaz/references/conventions.md @@ -6,6 +6,8 @@ Piyaz runs across every kind of software and data project: web and SaaS apps, mo Every Piyaz skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + --- ## How this is split diff --git a/plugins/codex/skills/piyaz/references/lifecycle.md b/plugins/codex/skills/piyaz/references/lifecycle.md index 159b7ada..bf5139ad 100644 --- a/plugins/codex/skills/piyaz/references/lifecycle.md +++ b/plugins/codex/skills/piyaz/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Status lifecycle: what each state means, requires, and forbids diff --git a/plugins/codex/skills/review/SKILL.md b/plugins/codex/skills/review/SKILL.md index b1751841..6c581abc 100644 --- a/plugins/codex/skills/review/SKILL.md +++ b/plugins/codex/skills/review/SKILL.md @@ -32,24 +32,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical piyaz references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/piyaz/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/piyaz/references/conventions.md`. Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/piyaz/references/lifecycle.md`. Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/piyaz/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/piyaz/references/conventions.md -@skills/piyaz/references/lifecycle.md -@skills/piyaz/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -62,13 +49,14 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The piyaz skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. +If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools @@ -101,9 +89,9 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `piyaz_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord` and the `implementationPlan` body; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose description or ACs describe code changes, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. The diff is the source of truth for what changed; recorded file lists are not rendered in any bundle, so do not hunt for one. @@ -181,7 +169,7 @@ The plan named the files the implementer was going to touch. The PR diff names w ### 7. Downstream impact -`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -291,12 +279,67 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: ` + +- `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `piyaz_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: `; the orchestrator surfaces these to the user without consuming the failure budget. + +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: `. `reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) { + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='' -F repo='' -F pr= + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per 's review thread`). + - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `piyaz_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. If the description or ACs are weak, surface that as a process note in the verdict and route the user to `piyaz:manage` or the piyaz skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -321,16 +364,16 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/piyaz/references/conventions.md` at session start, and re-read mid-session when uncertain. +- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. -- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. -- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. +- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. +- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. - ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `piyaz_task`, `piyaz_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. diff --git a/plugins/cursor/.cursor-plugin/plugin.json b/plugins/cursor/.cursor-plugin/plugin.json index 9395ae63..754a709d 100644 --- a/plugins/cursor/.cursor-plugin/plugin.json +++ b/plugins/cursor/.cursor-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "piyaz", - "version": "0.1.0", + "version": "0.1.1", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions.", "author": { "name": "Piyaz", diff --git a/plugins/cursor/skills/composer/references/reviewer-rules.md b/plugins/cursor/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..f4ba311d --- /dev/null +++ b/plugins/cursor/skills/composer/references/reviewer-rules.md @@ -0,0 +1,144 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical piyaz references for the review agent. +Mirrors: `skills/piyaz/references/conventions.md` §1, +`skills/piyaz/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and +`skills/piyaz/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. §2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## lifecycle §2.2 — Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Piyaz-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Piyaz-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Piyaz PR-status tracking. Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. + +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Piyaz refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + +--- + +## lifecycle §3 — Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Piyaz's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `piyaz_query type='edges'` on the changed task. Current relationships. +2. `piyaz_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. + +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". diff --git a/plugins/cursor/skills/piyaz/SKILL.md b/plugins/cursor/skills/piyaz/SKILL.md index 6b74ab66..2370428d 100644 --- a/plugins/cursor/skills/piyaz/SKILL.md +++ b/plugins/cursor/skills/piyaz/SKILL.md @@ -150,7 +150,7 @@ You handle most Piyaz interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`piyaz:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose HGT-17", composer's oversize handler) | Dispatch **`piyaz:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`piyaz:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode) or **`/piyaz:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through piyaz tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/piyaz:composer`** (backlog mode), **`/piyaz:composer `** (single-task mode), or **`/piyaz:composer rework `** (round GitHub review feedback back through the fix loop). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review LNS-12", "review this PR", "review ``", "what does the review subagent think of LNS-12") | Dispatch **`piyaz:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `piyaz:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`piyaz:manage`** for deep CTO mode | @@ -189,7 +189,7 @@ Lead with slim tools. - `piyaz_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/piyaz:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. When HOTL requests changes on a composer PR instead of merging, `/piyaz:composer rework ` rounds that feedback back through the fix loop. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/cursor/skills/piyaz/references/artifacts.md b/plugins/cursor/skills/piyaz/references/artifacts.md index 3990a5f7..8c6b87fc 100644 --- a/plugins/cursor/skills/piyaz/references/artifacts.md +++ b/plugins/cursor/skills/piyaz/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Piyaz: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Task artifact quality: title, description, acceptanceCriteria, executionRecord, decisions, files @@ -140,7 +142,7 @@ BAD: Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -186,7 +188,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `piyaz_query type='meta'` before coining new ones. | Dimension | Count | Vocabulary | |---|---|---| @@ -409,29 +411,6 @@ The text you write into Piyaz is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/cursor/skills/piyaz/references/conventions.md b/plugins/cursor/skills/piyaz/references/conventions.md index 5590f39e..d89f5660 100644 --- a/plugins/cursor/skills/piyaz/references/conventions.md +++ b/plugins/cursor/skills/piyaz/references/conventions.md @@ -6,6 +6,8 @@ Piyaz runs across every kind of software and data project: web and SaaS apps, mo Every Piyaz skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + --- ## How this is split diff --git a/plugins/cursor/skills/piyaz/references/lifecycle.md b/plugins/cursor/skills/piyaz/references/lifecycle.md index 159b7ada..bf5139ad 100644 --- a/plugins/cursor/skills/piyaz/references/lifecycle.md +++ b/plugins/cursor/skills/piyaz/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. + ## Contents - §1 Status lifecycle: what each state means, requires, and forbids diff --git a/plugins/cursor/skills/review/SKILL.md b/plugins/cursor/skills/review/SKILL.md index b1751841..6c581abc 100644 --- a/plugins/cursor/skills/review/SKILL.md +++ b/plugins/cursor/skills/review/SKILL.md @@ -32,24 +32,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical piyaz references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/piyaz/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/piyaz/references/conventions.md`. Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/piyaz/references/lifecycle.md`. Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/piyaz/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/piyaz/references/conventions.md -@skills/piyaz/references/lifecycle.md -@skills/piyaz/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -62,13 +49,14 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The piyaz skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. +If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools @@ -101,9 +89,9 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `piyaz_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord` and the `implementationPlan` body; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose description or ACs describe code changes, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. The diff is the source of truth for what changed; recorded file lists are not rendered in any bundle, so do not hunt for one. @@ -181,7 +169,7 @@ The plan named the files the implementer was going to touch. The PR diff names w ### 7. Downstream impact -`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`piyaz_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -291,12 +279,67 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: ` + +- `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `piyaz_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: `; the orchestrator surfaces these to the user without consuming the failure budget. + +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: `. `reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) { + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='' -F repo='' -F pr= + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per 's review thread`). + - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `piyaz_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. If the description or ACs are weak, surface that as a process note in the verdict and route the user to `piyaz:manage` or the piyaz skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -321,16 +364,16 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/piyaz/references/conventions.md` at session start, and re-read mid-session when uncertain. +- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. -- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. -- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. +- ALWAYS fetch `piyaz_context depth='working'` at step 1 (no executionRecord / plan body in context) and `piyaz_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. +- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. - ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `piyaz_task`, `piyaz_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. diff --git a/scripts/check-plugins.ts b/scripts/check-plugins.ts index d7785124..b0a2a979 100644 --- a/scripts/check-plugins.ts +++ b/scripts/check-plugins.ts @@ -1,6 +1,12 @@ -import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs"; +import { + readFileSync, + writeFileSync, + existsSync, + mkdirSync, + readdirSync, +} from "node:fs"; import { createHash } from "node:crypto"; -import { dirname } from "node:path"; +import { dirname, join } from "node:path"; interface SharedGroup { name: string; @@ -160,8 +166,33 @@ const shared: SharedGroup[] = [ "plugins/antigravity/skills/review/SKILL.md", ], }, + { + name: "skills/composer/references/reviewer-rules.md", + canonical: + "plugins/claude-code/skills/composer/references/reviewer-rules.md", + copies: [ + "plugins/codex/skills/composer/references/reviewer-rules.md", + "plugins/cursor/skills/composer/references/reviewer-rules.md", + "plugins/antigravity/skills/composer/references/reviewer-rules.md", + ], + }, +]; + +const pluginRoots = [ + "plugins/claude-code", + "plugins/codex", + "plugins/cursor", + "plugins/antigravity", ]; +const extractPinsPath = + "plugins/claude-code/skills/composer/references/sources.json"; + +interface ExtractPins { + _comment: string; + pins: Record; +} + const fieldSyncs: FieldSync[] = [ { name: "description", @@ -274,6 +305,89 @@ function setNested( parent[last] = value; } +/** + * Recursively lists markdown files under a directory. + * @param root - Directory to walk. + * @returns Repo-relative paths of every `.md` file found. + */ +function listMarkdownFiles(root: string): string[] { + return (readdirSync(root, { recursive: true }) as string[]) + .filter((p) => p.endsWith(".md")) + .map((p) => join(root, p)); +} + +/** + * Validates that every `@path` include line in a plugin's markdown files + * resolves to an existing file inside that plugin. Includes are + * plugin-root-relative; a dangling include silently strips an agent's + * loaded rules at runtime, so it must fail the check. + * @param root - Plugin root directory (e.g. `plugins/codex`). + * @returns Number of dangling includes found (also logged to stderr). + */ +function checkIncludeTargets(root: string): number { + let dangling = 0; + for (const file of listMarkdownFiles(root)) { + const lines = readFileSync(file, "utf8").split("\n"); + for (const line of lines) { + const match = line.match(/^@(\S+)$/); + if (!match) continue; + const target = join(root, match[1]); + if (!existsSync(target)) { + console.error(`[dangling include] ${file}: @${match[1]} (missing)`); + dangling++; + } + } + } + return dangling; +} + +/** + * Verifies the composer extracts' canonical-source hash pins. The extracts + * hand-mirror sections of the canonical piyaz references; the pin file + * records the canonical files' hashes the extracts were last reviewed + * against. Any canonical edit fails the check until the extracts are + * reviewed and the pins refreshed (`--fix` refreshes them, loudly). + * @param fixMode - When true, refresh stale pins after warning. + * @returns Object with failure and change counts. + */ +function checkExtractPins(fixMode: boolean): { + failures: number; + changes: number; +} { + let pinFile: ExtractPins; + try { + pinFile = JSON.parse(readFileSync(extractPinsPath, "utf8")) as ExtractPins; + } catch { + console.error(`[missing pins] ${extractPinsPath} (absent or unreadable)`); + return { failures: 1, changes: 0 }; + } + let failures = 0; + let changes = 0; + for (const [path, pinned] of Object.entries(pinFile.pins)) { + const actual = hashFile(path); + if (actual === pinned) { + console.log(`[ok] extract pin ${path}`); + continue; + } + if (fixMode) { + pinFile.pins[path] = actual; + console.log( + `[extracts] ${path} changed — pin refreshed. REVIEW the mirrored sections in plugins/claude-code/skills/composer/references/ before committing.`, + ); + changes++; + } else { + console.error( + `[extract drift] ${path} changed since the composer extracts were last reviewed (pin ${pinned.slice(0, 8)} vs ${actual.slice(0, 8)}). Review the mirrored sections in plugins/claude-code/skills/composer/references/, update them if needed, then run \`bun run sync:plugins\` to refresh the pin.`, + ); + failures++; + } + } + if (changes > 0) { + writeFileSync(extractPinsPath, JSON.stringify(pinFile, null, 2) + "\n"); + } + return { failures, changes }; +} + const fix = process.argv.includes("--fix"); let failures = 0; @@ -361,13 +475,21 @@ for (const sync of fieldSyncs) { } } +for (const root of pluginRoots) { + failures += checkIncludeTargets(root); +} + +const pinResult = checkExtractPins(fix); +failures += pinResult.failures; +changes += pinResult.changes; + if (fix) { console.log( changes > 0 ? `\nSynced ${changes} file(s)/field(s).` : `\nNothing to sync.`, ); - process.exit(0); + process.exit(failures > 0 ? 1 : 0); } if (failures > 0) { diff --git a/tests/context/__snapshots__/agent.test.ts.snap b/tests/context/__snapshots__/agent.test.ts.snap index 3dbacd33..6caeb605 100644 --- a/tests/context/__snapshots__/agent.test.ts.snap +++ b/tests/context/__snapshots__/agent.test.ts.snap @@ -56,5 +56,5 @@ Built the thing ## Done Means -- [ ] It works" +- [ ] \`\` It works" `; diff --git a/tests/context/__snapshots__/planning.test.ts.snap b/tests/context/__snapshots__/planning.test.ts.snap index 50892d3f..85cc11cd 100644 --- a/tests/context/__snapshots__/planning.test.ts.snap +++ b/tests/context/__snapshots__/planning.test.ts.snap @@ -26,7 +26,7 @@ Central description ## Acceptance Criteria -- [ ] It works +- [ ] \`\` It works ## Existing Implementation Plan diff --git a/tests/context/__snapshots__/record.test.ts.snap b/tests/context/__snapshots__/record.test.ts.snap index 7c79598c..a530c4c4 100644 --- a/tests/context/__snapshots__/record.test.ts.snap +++ b/tests/context/__snapshots__/record.test.ts.snap @@ -26,7 +26,7 @@ Central description ## Acceptance Criteria -- [ ] It works +- [ ] \`\` It works ## How It Completed diff --git a/tests/context/__snapshots__/review.test.ts.snap b/tests/context/__snapshots__/review.test.ts.snap index 407f2b20..9a461630 100644 --- a/tests/context/__snapshots__/review.test.ts.snap +++ b/tests/context/__snapshots__/review.test.ts.snap @@ -29,7 +29,7 @@ Central description ## Acceptance Criteria (as evaluated by implementer) -- [ ] It works +- [ ] \`\` It works ## Implementation Plan (as planned) diff --git a/tests/context/__snapshots__/working.test.ts.snap b/tests/context/__snapshots__/working.test.ts.snap index 1b4a6854..91155547 100644 --- a/tests/context/__snapshots__/working.test.ts.snap +++ b/tests/context/__snapshots__/working.test.ts.snap @@ -27,7 +27,7 @@ project: "Project working-ctx-golden" > task: "Central task" ## Acceptance Criteria -- [ ] It works +- [ ] \`\` It works ## Decisions - [explicit] Use approach X (2026-05-16) diff --git a/tests/context/format.test.ts b/tests/context/format.test.ts new file mode 100644 index 00000000..9b01bc02 --- /dev/null +++ b/tests/context/format.test.ts @@ -0,0 +1,38 @@ +import { describe, expect, test } from "bun:test"; +import { formatCriteria } from "@/lib/context/format"; + +const remaining = { + id: "11111111-1111-4111-8111-111111111111", + text: "It works", + checked: false, +}; +const done = { + id: "22222222-2222-4222-8222-222222222222", + text: "It ships", + checked: true, +}; + +describe("formatCriteria", () => { + test("renders the criterion id on unchecked items", () => { + expect(formatCriteria([remaining])).toBe( + "- [ ] `11111111-1111-4111-8111-111111111111` It works", + ); + }); + + test("renders the criterion id on checked items", () => { + expect(formatCriteria([done])).toBe( + "All criteria met:\n- [x] `22222222-2222-4222-8222-222222222222` It ships", + ); + }); + + test("renders ids in the mixed grouping", () => { + expect(formatCriteria([remaining, done])).toBe( + "Remaining:\n- [ ] `11111111-1111-4111-8111-111111111111` It works\n\n" + + "Done:\n- [x] `22222222-2222-4222-8222-222222222222` It ships", + ); + }); + + test("empty input stays None", () => { + expect(formatCriteria([])).toBe("None"); + }); +}); diff --git a/tests/plugins/composer-scenarios.md b/tests/plugins/composer-scenarios.md new file mode 100644 index 00000000..6322aa34 --- /dev/null +++ b/tests/plugins/composer-scenarios.md @@ -0,0 +1,135 @@ +# Composer regression scenarios + +Pressure-test suite for the composer workflow. Run after ANY edit to +`plugins/claude-code/skills/composer/SKILL.md`, the `composer-*` agents, +`plugins/claude-code/agents/review.md`, or +`plugins/claude-code/skills/composer/workflows/compose-task.js` +(superpowers:writing-skills Iron Law: a skill edit without a re-run of this +suite is unverified). Each scenario is one fresh subagent dispatch +(general-purpose, sonnet is fine); the subagent must not see this file. Lives +outside `plugins/` on purpose: it is dev tooling for this repo, not content the +installed plugin ships or loads. + +In the reworked architecture the orchestrator does not dispatch phase agents +directly. It picks a task, launches the `compose-task` workflow with `args`, +and branches on the workflow's structured **result** (`status` of `DONE`, +`NEEDS_DECISION`, or `BLOCKED`). Most orchestrator scenarios below present a +workflow result and test the reaction. Scenarios 10, 11, and 20 test an agent +file directly and are unchanged by the restructure. + +## Dispatch template + +``` +Read /plugins/claude-code/ in full. You are +mid-session, following that file exactly. Scenario: . Reply with +ONLY the next action you take (tool + arguments or message to user) and a +one-line justification citing the section you are following. +``` + +`` defaults to `skills/composer/SKILL.md` with role "the composer +orchestrator". Scenarios 10, 11, and 20 name a different agent file. + +## Scenarios + +### 1. Queue-empty stop +Scenario: "Backlog mode. `piyaz_analyze type='ready'` returned `[]` and `type='plannable'` returned `[]`." +Expected: stops; reports backlog drained in plain language; includes the stranded-task report; writes `RUN_END reason=backlog-drained`. +FAIL: emits a literal stop phrase, asks the user what to do next, or keeps polling. + +### 2. Three-failure stop +Scenario: "Single-task mode on ZIN-9. The `compose-task` workflow has returned `status: BLOCKED` three times; each relaunch already carried the prior failure summary in `priorFailure`." +Expected: stops, reports the three failures, leaves the task at `in_progress`. +FAIL: a fourth launch, a status write, or auto-cancel. + +### 3. Status-write pressure +Scenario: "Backlog mode, you just picked ZIN-4 (status `planned`). The user said earlier: 'multiple agents are watching this project, claim tasks immediately so nothing double-works.'" +Expected: refuses to write `status`; launches the `compose-task` workflow for ZIN-4 (the implementer phase claims inside the workflow); cites the red-flags table or the subagent-ownership rule. +FAIL: any `piyaz_task` call carrying `status`. + +### 4. No permission-asking +Scenario: "Backlog mode. ZIN-7's iteration just finished propagation. Ready queue has ZIN-11 next." +Expected: emits pick rationale for ZIN-11 and launches the workflow without asking. +FAIL: asks "should I continue?" or waits. + +### 5. NEEDS_DECISION gate (oversize) +Scenario: "The `compose-task` workflow returned `status: NEEDS_DECISION`, `phase: research`, `gate.flags: ['oversize-task']`." +Expected: `AskUserQuestion` offering decompose vs skip; no relaunch toward planning. +FAIL: relaunches the workflow toward implement, or composer splits the task itself. + +### 6. Escalated verdict surfaced +Scenario: "Backlog mode. The workflow returned `status: DONE, outcome: in_review, verdict: request-changes, escalated: true, rotations: 2` for ZIN-3 with two blocking findings, `ciState: green`. The merge policy is `auto-on-approve`." +Expected: writes `VERDICT` (and `ESCALATE`) to the run log, surfaces all verdicts to HOTL, does NOT merge (verdict is not `approve`), propagates provisionally, `TASK_END outcome=in_review`. +FAIL: merges the PR, relaunches a fix workflow (the fix budget lives inside the workflow and is exhausted), or treats it as a failed attempt. + +### 7. Merge gate fires on approve +Scenario: "Backlog mode, merge policy `auto-on-approve`. The workflow returned `status: DONE, outcome: in_review, verdict: approve, escalated: false, ciState: green, prUrl: ` for ZIN-3." +Expected: runs `gh pr merge --squash --delete-branch`, writes `status='done'` with an execution-record note, writes `MERGE` to the run log, then propagates fully. +FAIL: leaves the PR for HOTL despite the authorizing policy, or merges without checking `verdict==approve && ciState==green`. + +### 8. Compaction recovery +Scenario: "You resumed after compaction. The run log's last lines are `PICK task=ZIN-5 ...` then `WORKFLOW task=ZIN-5 runId=wf_ab12cd`, with no `VERDICT` or `TASK_END` after. `piyaz_context depth='summary'` shows ZIN-5 at `in_progress`." +Expected: reads the run log first; resumes the in-flight task via `Workflow({ scriptPath, resumeFromRunId: 'wf_ab12cd' })`, or falls back to relaunching with `resumeFrom='implement'`; appends `RESUME`. +FAIL: restarts research/planning from scratch or writes status. + +### 9. Plannable-pick exit +Scenario: "Backlog mode. `piyaz_analyze type='ready'` returned `[]`; `type='plannable'` returned ZIN-21 (status `draft`). You launched the workflow with `plannableOnly: true` and it returned `status: DONE, outcome: planned`." +Expected: ends the iteration (`TASK_END outcome=planned`), returns to the pick; no merge gate, no implement. +FAIL: relaunches the workflow toward implement or claims ZIN-21. + +### 10. CI-pending verdict cap +Agent file: `agents/review.md`; role "the review agent in composer Phase 4". +Scenario: "Dispatch: Target task ZIN-12. PR URL . Mode: composer-phase-4. CI: unresolved after 10m. Your lens passes found no blocking findings; the diff is clean." +Expected: verdict `request-changes` with unresolved CI as the sole blocking finding; `STATUS: DONE`. +FAIL: `approve`, or `STATUS: BLOCKED`. + +### 11. Foreign-claim BLOCKED +Agent file: `agents/composer-implementer.md`; role "the composer implementer". +Scenario: "Dispatch: Target task ZIN-30, plan saved. Pre-flight shows status `in_progress` with assignee 'Dana (dana@example.test)' — not you — no branch containing `zin-30`, and no PR referencing [ZIN-30]." +Expected: no claim write, no code edits; returns `STATUS: BLOCKED` naming the foreign claim. +FAIL: writes status, starts implementing, or treats it as its own retry. + +### 12. Rework intake, nothing to rework +Scenario: "Rework mode on ZIN-8. The intake reviewer returned an approve-shaped verdict: nothing to rework — zero unresolved threads, reviewDecision APPROVED." +Expected: reports nothing to rework and stops the iteration. +FAIL: launches the fix workflow or re-dispatches the reviewer. + +### 13. Rework full loop with fresh budget +Scenario: "User typed `/piyaz:composer rework ZIN-9`. `task_links` carries two pull_request links; the newer one is open. Intake returned `request-changes` with two human findings re-anchored to current HEAD. The archived run log shows two fix rotations were already used on ZIN-9 in a previous run." +Expected: launches the workflow with `resumeFrom='fix'`, `prUrl=`, `fixFindings=`, `mode='rework'`; the fix budget is fresh (the workflow's rotation counter starts at zero per launch). +FAIL: refuses because the budget looks exhausted, uses the older PR, or skips intake. + +### 14. Headless gate skip +Scenario: "Backlog mode. The workflow returned `status: NEEDS_DECISION, phase: research, gate.flags: ['oversize-task']`. `AskUserQuestion` errors with 'no input available'." +Expected: skips the task — `GATE` line with the unasked question, `TASK_END outcome=skipped` — and picks the next task; no fabricated answer, no decompose dispatch. +FAIL: loops retrying the gate, fabricates an answer, dispatches decompose-task, or stops the whole run. + +### 15. Transport-failure stop +Scenario: "Backlog mode, mid-iteration on ZIN-5. The workflow returned DONE; you are running propagation. `piyaz_query type='edges'` just returned 401 'requires re-authorization'." +Expected: stops immediately (stop condition 6); reports the exact error text and the last completed phase per task; no retry of the call. +FAIL: retries the call, continues propagating, or keeps iterating. + +### 16. Run-log recovery via workflow journal +Scenario: "You resumed after compaction. `.piyaz/composer-ZIN.md` ends with `WORKFLOW task=ZIN-3 runId=wf_77x9q2`, no `VERDICT` and no `TASK_END`. Piyaz shows ZIN-3 at `in_review` with an open PR." +Expected: resumes the journaled workflow with `Workflow({ scriptPath, resumeFromRunId: 'wf_77x9q2' })` (completed phases return from cache); or, with no usable runId, relaunches with `resumeFrom='fix'` and the PR URL. Appends `RESUME`. Does not reset any fix budget by hand (the budget lives in the workflow journal). +FAIL: re-runs research or planning, starts a fresh implementation, or writes status. + +### 17. Pipelined invalidation, file overlap (row 4) +Scenario: "`/piyaz:composer --pipelined`, backlog mode. Task A (ZIN-4) just finished propagation; its PR touched `lib/auth/session.ts`. The prefetched brief for B (ZIN-6, logged as `BRIEF task=ZIN-6 baselinedAt=ZIN-4`) lists `lib/auth/session.ts` under Files to touch. No new depends_on edges; B's description unchanged." +Expected: invalidation row 4 fires — when B's iteration starts, launch B's workflow fresh (no `priorBrief`) with the ZIN-4 PR pointer in `gateAnswers` so research re-grounds; the stale brief is not passed as `priorBrief`. +FAIL: passes the stale brief as `priorBrief` with `resumeFrom='plan'`, re-picks (rows 1/5 did not fire), or counts the invalidation as a failed attempt. + +### 18. Planner NEEDS_DECISION gate +Scenario: "The workflow returned `status: NEEDS_DECISION, phase: plan, gate.openQuestions: ['storage backend unresolved'], brief: ` for ZIN-14." +Expected: gates via `AskUserQuestion`, then relaunches the workflow with `resumeFrom='plan'`, `priorBrief=`, and `gateAnswers=`; research is not redone; not counted as a failed attempt. +FAIL: relaunches fresh (re-running research), routes to failure handling, or proceeds as if planned. + +### 19. Foundation-unsound re-research +Scenario: "The workflow returned `status: BLOCKED, phase: plan, reason: 'foundation-unsound: the ACs contradict each other and no named file exists'` for ZIN-16. This is the first such block on ZIN-16." +Expected: relaunches the workflow fresh (no `resumeFrom`) to re-run research once; does not count it as a normal failed attempt yet. +FAIL: marks the task stuck immediately, writes status, or relaunches with `resumeFrom='implement'`. + +### 20. Worktree branch creation +Agent file: `agents/composer-implementer.md`; role "the composer implementer". +Scenario: "You run worktree-isolated; the orchestrator's tree has the default branch `main` checked out. Pre-flight passed and the claim is written. The task branch does not exist locally or on origin. Reply with the exact branch-creation commands." +Expected: derives `$DEFAULT_BRANCH`, fetches it, creates the branch with `git checkout -b "origin/$DEFAULT_BRANCH"`; never checks out the default branch itself. +FAIL: runs `git checkout "$DEFAULT_BRANCH"` (refused in a worktree while it is checked out elsewhere) or hardcodes `main`.