From 63b7e91ba72e907750d678498aa0f02b63512915 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 07:56:45 +0200 Subject: [PATCH 01/45] feat: add researcher rules extract for composer --- .../composer/references/researcher-rules.md | 315 ++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 plugins/claude-code/skills/composer/references/researcher-rules.md diff --git a/plugins/claude-code/skills/composer/references/researcher-rules.md b/plugins/claude-code/skills/composer/references/researcher-rules.md new file mode 100644 index 00000000..1e74d1ea --- /dev/null +++ b/plugins/claude-code/skills/composer/references/researcher-rules.md @@ -0,0 +1,315 @@ +# Researcher rules (composer Phase 1 extract) + +Slim extract of the canonical mymir references for the composer researcher. +Mirrors: `skills/mymir/references/conventions.md` §1, §4 and +`skills/mymir/references/artifacts.md` §1 (Title, `description`, +`acceptanceCriteria`, `decisions`), §2, §5, §6. Section numbers below match +the canonical files so citations like `conventions §1` resolve here. When +editing a mirrored section, edit BOTH files. + +--- + +## 1. The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. + +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. The onboarding agent verifies file existence with Bash before claiming. +- `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. +- `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. + +When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. 
+ +**Spec-review and open-questions tasks: cite the on-graph artifact.** When marking a spec-review, decision-only, or open-questions task `done`, every checked AC must cite an on-graph artifact: a sibling task's plan, a sibling's executionRecord, an edge note, or a decision recorded on a related task. Do not synthesize answers from training data. Reference the related task by ref (e.g. `MYMR-83`) inside the AC text or the executionRecord. This is what makes a spec-review completion honest instead of hallucinated. + +`decisions` are different (see §1 of the artifact rules below). They come from the conversation, not from artifact-mining. + +--- + +## 4. taskRef format + +Tool responses include a `taskRef` like `MYMR-83`: uppercase project prefix, dash, integer. Use the ref in user-facing output. **Always pass the UUID `taskId` to tool calls. Never the ref.** + +--- + +## 1. Task artifact quality + +### Title + +Verb plus noun, imperative. + +``` +GOOD: "Implement JWT auth" +GOOD: "Fix Queue::front returning a copy" +GOOD: "Profile renderer hot path" +GOOD: "Train baseline ResNet on internal dataset" + +BAD: "Auth" +BAD: "Queue stuff" +BAD: "Performance" +``` + +### `description` + +The first thing a coding agent or engineer reads when picking up a task. It must be enough on its own to start the work. Concise and clear. + +Cover, depending on task type: + +- **Feature**: what the capability does, who it serves, where it lives in the architecture. +- **Bug**: what is broken, when it manifests, why it matters, and the suspected root cause if known. +- **Refactor / improvement**: what changes, what stays the same, why it is worth doing now. +- **Research / investigation**: what the question is, why it needs answering, what a good answer looks like. +- **Chore / setup / docs**: what needs doing and why now. + +- **Solution sketch:** if you have one, include it. "Use Drizzle, mirror the patterns in `lib/data/task.ts`" is more useful than "Define the database tables". 
+- **No speculation:** do not pad with implementation guesses when the approach is uncertain. The implementation plan is for that. + +Length: 2 to 4 sentences for most tasks. Up to 6 to 8 sentences for genuinely complex tasks. Single-sentence descriptions are rejected. + +**For onboarding** (writing descriptions for tasks that already shipped): write the description as if the task were being created BEFORE the work, knowing what you now know about the codebase. The reader must be able to re-derive the work from the description. Do not write "added the auth middleware". Write "Build the JWT auth middleware in `lib/auth/middleware.ts`. Validate Bearer tokens against the user table, set `req.user`, reject on expiry. Required by every protected route." + +``` +GOOD (feature, web SaaS): +"Build the habit completion endpoint at POST /api/habits/:id/complete. Inserts +into habit_logs with the user's timezone-adjusted date. Returns the updated +streak count. Idempotent on (habit_id, log_date): duplicate calls return the +existing log. Used by both the web dashboard and the iOS widget." + +GOOD (bug, simulation engine): +"Fix Queue::front returning a copy instead of a reference. Spec §4.2.4.2 +requires the head pointer to be modifiable in-place so Airport::moveToRunway +can swap it out without a re-insert. Currently caught by a unit test on +takeoff_flow. Likely a one-line change in include/Queue.h." + +GOOD (research, ML platform): +"Investigate whether torch.compile improves training throughput on the +ResNet-50 baseline. Question: does compile-time speedup outweigh JIT overhead +on our 8-GPU pod? A good answer is a benchmark script plus a one-paragraph +recommendation comparing wall-clock per epoch and peak memory." + +GOOD (refactor, embedded firmware): +"Move the SPI driver from polling to DMA. Same public surface (spi_send, +spi_recv), same wire protocol. Internally use STM32 HAL DMA1 channel 3 for +TX. 
Reduces CPU usage during sensor reads from ~15% to <1% per existing +profile traces." + +GOOD (feature, game engine): +"Add deterministic frame stepping to the simulation tick. New API +Engine::stepFrame(uint32_t seed) so replay tooling and netcode tests can +re-run identical state from a recorded seed. Affects PhysicsWorld, Scheduler, +and the InputBuffer drain order." + +GOOD (data / dbt model build): +"Build the daily_active_users dbt model in models/marts/engagement/. Reads +from stg_events.session_started, deduplicates on (user_id, date_trunc('day', +event_ts)), excludes internal traffic via is_internal flag from dim_users. +Materializes incremental on event_date with a 7-day lookback window. Used by +the Looker `Engagement Overview` dashboard and the weekly stakeholder report." + +GOOD (BA / metric definition): +"Define the gross_margin metric in the dbt metrics layer. Formula: (revenue +- cogs) / revenue, dimensioned by product_line, channel, and order_month. +Source: fct_orders joined to dim_products. Replaces the four near-duplicate +SQL versions currently maintained by Sales Ops, Finance, and Marketing. +Stakeholders: CFO weekly review, RevOps dashboard." + +BAD: "Improve the database." +BAD: "Make auth better." +BAD: "Fix the bug in queue." +BAD: "Build the dashboard." +``` + +### `acceptanceCriteria` + +2 to 4 items. Each criterion must be **binary**: a reviewer can answer YES or NO without ambiguity. 
+ +``` +GOOD: +- "Running bun run db:push creates all tables without errors" +- "User table has id, email, name, passwordHash, createdAt columns" +- "FK from tasks.projectId to projects.id with ON DELETE CASCADE" +- "Seed script creates 3 test users and 2 projects with tasks" + +GOOD (firmware): +- "spi_send returns within 50µs at 80MHz clock measured on logic analyzer" +- "DMA TX completion fires interrupt; no busy-loop in the driver" +- "spi_recv returns 0xFF when MISO is held high, verified on the bench" + +GOOD (data / dbt): +- "dbt run --select daily_active_users completes in under 90s on prod warehouse" +- "Row count of daily_active_users on 2026-05-01 matches stg_events session count to within 0.1%" +- "dbt test passes: not_null on user_id and event_date, unique on (user_id, event_date)" +- "Looker `Engagement Overview` dashboard refreshes against the new model with no broken tiles" + +GOOD (BA / analysis deliverable): +- "Churn analysis SQL in analyses/2026q2_churn.sql returns the 14 churned cohorts with ARR per cohort" +- "Numbers reconcile with finance_actuals.gross_revenue to within $500 for every month in scope" +- "Stakeholder review notes from the 2026-05-08 RevOps sync are attached to the task" + +BAD: +- "Database works" +- "All tables created" +- "Tests pass" +- "Performance is good" +- "Dashboard looks right" +- "Numbers match" +``` + +Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +Where decisions come from: + +- **Refinement, planning, or implementation conversation.** When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. 
+- **Onboarding (special case)**: the agent reads existing artifacts to recover decisions made before Mymir entered the picture. Sources: manifest files (`package.json`, `Cargo.toml`, `go.mod`, `pyproject.toml`, `Package.swift`), README and design docs, commit messages with words like *chose*, *switched*, *replaced*, *migrated*. If a decision is not grounded in any of those, omit it. Better a shorter list than fabrication. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (web): "Switched from Prisma to Drizzle. See package.json migration commit." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." +GOOD (ML): "Chose ONNX runtime over PyTorch for inference. 30% lower p99 on the target Jetson Orin." +GOOD (embedded): "Pick Zephyr over FreeRTOS for the new flight controller. Built-in CAN driver, Apache-2.0 license." +GOOD (agentic): "Use a per-thread tool registry. Two concurrent agent loops were stepping on each other's MCP client state." +GOOD (data): "Use dbt incremental over full-refresh on daily_active_users. Source events table is 4B rows; full-refresh exceeds the 30-minute warehouse SLA." +GOOD (BA): "Adopt dbt metrics layer over per-dashboard SQL. Four duplicates of gross_margin already exist across Looker, Tableau, and the weekly deck; one definition replaces them all." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +BAD: "dbt is better" +``` + +Never invent. If a decision is not grounded in conversation, code, or the artifacts above, leave it out. + +--- + +## 2. Tag dimensions and first-class fields + +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='overview'` before coining new ones. 
+ +| Dimension | Count | Vocabulary | +|---|---|---| +| **Work type** | exactly 1 | `bug`, `feature`, `refactor`, `docs`, `test`, `chore`, `perf` | +| **Cross-cutting concern** | ≥1 | quality attribute (`security`, `a11y`, `dx`, `perf`, `reliability`, `observability`, `i18n`, `compliance`, `safety`) or feature cluster spanning multiple categories (web: `onboarding-flow`, `live-replay`; aerospace: `mission-planning`; agentic: `agent-loop`, `eval-harness`; ML: `inference-pipeline`, `data-drift`; financial: `risk-engine`, `pricing-model`) | +| **Tech** | at most 2 | most important stack pieces the task touches; pull from manifest deps | + +### First-class fields (priority, estimate, assignees) + +These are top-level columns on every task, set via `mymir_task` parameters of the same name. They are NOT tags. + +- **`priority`** (one of `urgent`, `core`, `normal`, `backlog`). Required-on-create-by-convention: pick deliberately. Defaults: onboarding (shipped features) lands at `core`; decompose picks per task and avoids `core` everywhere or `urgent` everywhere (the dimension carries no signal then). A 30-task project usually has 3 to 6 `urgent` tasks and the rest split between `core`, `normal`, and `backlog`. +- **`estimate`** (Fibonacci story points: `1`, `2`, `3`, `5`, `8`, `13`). Optional. `1` is trivial, `2` and `3` are routine, `5` is nontrivial, `8` and `13` are risky or multi-day. If a task feels larger than `13`, split it (§5). +- **`assigneeIds`** (array of team-member user UUIDs). Optional. Declares ownership / intent, not concurrent execution; the single-worker `in_progress` invariant still holds. Each id must be a member of the project's owning team (the server rejects non-members at write time). + +**Do NOT tag:** + +- Priority: that is the `priority` field's job. Setting `urgent`, `core`, `normal`, or `backlog` as tags duplicates the field and adds no signal. +- Codebase area: that's `category`'s job. 
**Test: would this name plausibly be a category in some other project shape?** `render-loop`, `effect-system`, `auth`, `payments`, `inference`, `marts`, `flight-control`, `hal-drivers` all answer YES. They're subsystems / product areas, even if your project's category list happens to omit them. Tags are axes the project does not shape itself around: quality attributes (`security`, `a11y`, `perf`, `reliability`, `observability`, `dx`, `compliance`, `safety`, `i18n`) and multi-category feature clusters (`onboarding-flow`, `agent-loop`, `mission-planning`, `live-replay`). If a candidate tag names a subsystem, surface it as a category proposal at the gate or use the existing category. Coining an area-shaped tag because the categories lack a good slot is a category-list bug, not a tag. +- Task status: that is `status`'s job. +- Generic adjectives like "important", "main", "primary". + +**Honoring user-specified tags:** if the user explicitly tagged something, preserve their tags. Add the missing dimensions if any of the three are absent. + +**Tech tag examples by domain:** + +- Web: `react`, `next`, `drizzle`, `postgres`, `tailwind` +- Mobile: `swift`, `swiftui`, `kotlin`, `coreml`, `room` +- Game: `unity`, `unreal`, `cpp`, `glsl`, `wgsl` +- Simulation: `cpp`, `fortran`, `mpi`, `cuda` +- Embedded: `c`, `rust`, `freertos`, `stm32-hal`, `zephyr` +- ML: `pytorch`, `jax`, `triton`, `clickhouse`, `dvc` +- Financial: `python`, `quantlib`, `numpy`, `arrow` +- Data / analytics / BA: `sql`, `dbt`, `bigquery`, `snowflake`, `postgres`, `looker`, `tableau`, `metabase`, `powerbi`, `airflow`, `dagster` + +Pull tech tags from the project's actual stack. Do not invent. + +--- + +## 5. Granularity + +**1 to 4 hours per task.** A coding agent should complete one in a single session. + +> **Starting count is not a cap.** The numbers below are seed values for decompose / onboarding, not an enumeration of every task that will ever exist. 
Real projects accumulate tasks as work materializes; teams add tasks every day. When a parent agent or a test rig caps the task count below the table's range, honor the cap and document the deviation in your transcript or local working file. + +| Project size | Starting task count | +|---|---| +| Hackathon / 1-day spike | 5 to 10 | +| Simple (≤5 features, single user role) | 10 to 20 | +| Medium (5 to 15 features, several roles) | 20 to 40 | +| Complex (15+ features, multiple subsystems) | 40 to 80 | +| Enterprise / multi-team / long-running | 60 to 120 foundation tasks. The graph grows organically into the hundreds or thousands as teams add work. | + +Too small (under 30 minutes): overhead exceeds work. +Too large (over 1 day): hidden subtasks, unclear scope, hard to track. + +When in doubt, split. Tasks become more useful, and more parallelizable, as they shrink toward the 1-hour mark. + +--- + +## 6. Markdown formatting and tone + +Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. +- Headings (`##`, `###`) only in long fields like `implementationPlan`. + +### Tone: never sound like AI + +The text you write into Mymir is read by other engineers. It must read like an engineer wrote it, not a chatbot. + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". 
+- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Em-dash replacements + +``` +BAD (web): "Custom auth — months of work — is off the table." +GOOD: "Custom auth is off the table. Months of work, easy to leak data." + +BAD (web): "The API uses Bearer tokens — validated against the users table." +GOOD: "The API validates Bearer tokens against the users table." + +BAD (sim): "Rejected — see line 42 of the spec." +GOOD: "Rejected. See line 42 of the spec." + +BAD (agentic): "The agent loop dispatches tools — validated against the + registry — then streams the model output." +GOOD: "The agent loop validates each tool against the registry + before dispatching, then streams the model output." + +BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not + backported." +GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not + backported." +``` + +### Length + +Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. 
From 726098cfd4963d0338562068cf6e15946b35bf16 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:02:27 +0200 Subject: [PATCH 02/45] feat: add planner rules extract for composer --- .../composer/references/planner-rules.md | 197 ++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 plugins/claude-code/skills/composer/references/planner-rules.md diff --git a/plugins/claude-code/skills/composer/references/planner-rules.md b/plugins/claude-code/skills/composer/references/planner-rules.md new file mode 100644 index 00000000..298dd1aa --- /dev/null +++ b/plugins/claude-code/skills/composer/references/planner-rules.md @@ -0,0 +1,197 @@ +# Planner rules (composer Phase 2 extract) + +Slim extract of the canonical mymir references for the composer planner. +Mirrors: `skills/mymir/references/conventions.md` §1, +`skills/mymir/references/artifacts.md` §1 (`description`, +`acceptanceCriteria`, `decisions`), §6, and +`skills/mymir/references/lifecycle.md` §1 (Summary, `draft`, `planned`). +Section numbers match the canonical files. When editing a mirrored +section, edit BOTH files. + +--- + +## 1. The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. + +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. The onboarding agent verifies file existence with Bash before claiming. +- `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. +- `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. + +When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. 
+ +**Spec-review and open-questions tasks: cite the on-graph artifact.** When marking a spec-review, decision-only, or open-questions task `done`, every checked AC must cite an on-graph artifact: a sibling task's plan, a sibling's executionRecord, an edge note, or a decision recorded on a related task. Do not synthesize answers from training data. Reference the related task by ref (e.g. `MYMR-83`) inside the AC text or the executionRecord. This is what makes a spec-review completion honest instead of hallucinated. + +`decisions` are different (see §1 of the artifact rules below). They come from the conversation, not from artifact-mining. + +--- + +## 1. Status lifecycle + +``` +draft → planned → in_progress → in_review → done + cancelled (terminal, reachable from any non-terminal) +``` + +### Summary + +| Status | Required fields | Forbidden fields | Trigger to leave | +|---|---|---|---| +| `draft` | `description`, `acceptanceCriteria` | `executionRecord`, `implementationPlan` | implementation plan saved → `planned` | +| `planned` | + `implementationPlan` (unabridged); all `depends_on` blockers `done` | `executionRecord` | someone claims via `action='update' status='in_progress'` → `in_progress` | +| `in_progress` | + active worker (one only) | — | work complete + record + ACs + Completion Protocol §2 run → `in_review` | +| `in_review` | + `executionRecord`, `decisions`, `files`, every AC evaluated, `prUrl` (optional sugar — when a PR was opened; backend upserts a `task_links` row with `kind='pull_request'`) | — | HOTL operator inspects PR and flips → `done` (or back to `in_progress` for rework) | +| `done` | (inherited from `in_review`) | — | terminal | +| `cancelled` | + `executionRecord` (rationale + what was tried), `decisions` | — | terminal | + +### `draft` + +- **What it means.** Scope captured. The task is real but unbuilt. +- **Cannot:** be coded directly. Needs planning first. 
+- **Transitions to `planned`:** when an implementation plan is written and saved on the task. The plan must be unabridged. Do not save summaries. + +### `planned` + +- **What it means.** Implementation plan is written. All `depends_on` blockers are themselves `done`. Ready for someone to claim and code. +- **Transitions to `in_progress`:** when someone explicitly claims via `mymir_task action='update' status='in_progress'`. Claim BEFORE starting work; this prevents two agents from grabbing the same task. + +--- + +## 1. Task artifact quality + +### `description` + +The first thing a coding agent or engineer reads when picking up a task. It must be enough on its own to start the work. Concise and clear. + +Cover, depending on task type: + +- **Feature**: what the capability does, who it serves, where it lives in the architecture. +- **Bug**: what is broken, when it manifests, why it matters, and the suspected root cause if known. +- **Refactor / improvement**: what changes, what stays the same, why it is worth doing now. +- **Research / investigation**: what the question is, why it needs answering, what a good answer looks like. +- **Chore / setup / docs**: what needs doing and why now. + +- **Solution sketch:** if you have one, include it. "Use Drizzle, mirror the patterns in `lib/data/task.ts`" is more useful than "Define the database tables". +- **No speculation:** do not pad with implementation guesses when the approach is uncertain. The implementation plan is for that. + +Length: 2 to 4 sentences for most tasks. Up to 6 to 8 sentences for genuinely complex tasks. Single-sentence descriptions are rejected. + +``` +GOOD (feature, web SaaS): +"Build the habit completion endpoint at POST /api/habits/:id/complete. Inserts +into habit_logs with the user's timezone-adjusted date. Returns the updated +streak count. Idempotent on (habit_id, log_date): duplicate calls return the +existing log. Used by both the web dashboard and the iOS widget." 
+ +GOOD (bug, simulation engine): +"Fix Queue::front returning a copy instead of a reference. Spec §4.2.4.2 +requires the head pointer to be modifiable in-place so Airport::moveToRunway +can swap it out without a re-insert. Currently caught by a unit test on +takeoff_flow. Likely a one-line change in include/Queue.h." + +BAD: "Improve the database." +BAD: "Make auth better." +BAD: "Fix the bug in queue." +BAD: "Build the dashboard." +``` + +### `acceptanceCriteria` + +2 to 4 items. Each criterion must be **binary**: a reviewer can answer YES or NO without ambiguity. + +``` +GOOD: +- "Running bun run db:push creates all tables without errors" +- "User table has id, email, name, passwordHash, createdAt columns" +- "FK from tasks.projectId to projects.id with ON DELETE CASCADE" +- "Seed script creates 3 test users and 2 projects with tasks" + +GOOD (firmware): +- "spi_send returns within 50µs at 80MHz clock measured on logic analyzer" +- "DMA TX completion fires interrupt; no busy-loop in the driver" +- "spi_recv returns 0xFF when MISO is held high, verified on the bench" + +BAD: +- "Database works" +- "All tables created" +- "Tests pass" +- "Performance is good" +``` + +Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +Where decisions come from: + +- **Refinement, planning, or implementation conversation.** When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. +- **Onboarding (special case)**: the agent reads existing artifacts to recover decisions made before Mymir entered the picture. 
Sources: manifest files (`package.json`, `Cargo.toml`, `go.mod`, `pyproject.toml`, `Package.swift`), README and design docs, commit messages with words like *chose*, *switched*, *replaced*, *migrated*. If a decision is not grounded in any of those, omit it. Better a shorter list than fabrication. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." +GOOD (agentic): "Use a per-thread tool registry. Two concurrent agent loops were stepping on each other's MCP client state." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. If a decision is not grounded in conversation, code, or the artifacts above, leave it out. + +--- + +## 6. Markdown formatting and tone + +Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. +- Headings (`##`, `###`) only in long fields like `implementationPlan`. + +### Tone: never sound like AI + +The text you write into Mymir is read by other engineers. It must read like an engineer wrote it, not a chatbot. + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". 
+- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Em-dash replacements + +``` +BAD (web): "Custom auth — months of work — is off the table." +GOOD: "Custom auth is off the table. Months of work, easy to leak data." + +BAD (sim): "Rejected — see line 42 of the spec." +GOOD: "Rejected. See line 42 of the spec." +``` + +### Length + +Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. 
From fe7d3cc8d9709ce0a9d93c00a2ebe1e9c627328b Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:04:51 +0200 Subject: [PATCH 03/45] feat: add implementer rules extract for composer --- .../composer/references/implementer-rules.md | 256 ++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 plugins/claude-code/skills/composer/references/implementer-rules.md diff --git a/plugins/claude-code/skills/composer/references/implementer-rules.md b/plugins/claude-code/skills/composer/references/implementer-rules.md new file mode 100644 index 00000000..c5f1549e --- /dev/null +++ b/plugins/claude-code/skills/composer/references/implementer-rules.md @@ -0,0 +1,256 @@ +# Implementer rules (composer Phase 3 extract) + +Slim extract of the canonical mymir references for the composer +implementer. Mirrors: `skills/mymir/references/conventions.md` §1, §2, +`skills/mymir/references/lifecycle.md` §1 (Summary, `in_progress`, +`in_review`), §2 (entire Completion Protocol, 2.1–2.4), and +`skills/mymir/references/artifacts.md` §1 (`executionRecord`, +`decisions`, `files`), §6. Section numbers match the canonical files. +When editing a mirrored section, edit BOTH files. + +--- + +## 1. The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. + +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. +- `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. +- `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. + +When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. + +`decisions` come from the conversation and the work, not from artifact-mining. Never invent them. + +--- + +## 2. 
Tool descriptions and `_hints` are runtime instructions + +Every Mymir tool injects two things into your context at use time: + +1. The tool's description and parameter schema, visible before the call. +2. A `_hints` array in the response, visible after the call. + +These are not optional commentary. They are server-side rules and state you cannot see otherwise. They override any prior plan you had. + +**Read on every tool call. Act before continuing.** + +Examples of hints you must obey: + +- Missing required fields on `done`: hint says `executionRecord is required`. Re-call with the field. +- Tool description says "REQUIRED in multi-team accounts". The server rejects ambiguous calls. +- Hint says "no ready tasks; try `mymir_analyze type='plannable'`". Switch to plannable. Do not invent ready work. +- Hint says "edges to cancelled task remain in place". Respect transitive blocking when reasoning about downstream readiness. + +**Order rule when multiple hints fire.** When two or more `_hints` come back in the same response (e.g. "missing files" plus "run propagation"), service them in order: required-field hints first (the task is not in its final state until they clear), then informational follow-ups (propagation, suggested next call). The propagation hint is informational and can be deferred a turn; a missing-required-field hint must be cleared before the task is considered fully transitioned. + +Skipping a hint is operating on stale information. A session that ignores hints generates output the server already knows is wrong. + +--- + +## 1. 
Status lifecycle + +``` +draft → planned → in_progress → in_review → done + cancelled (terminal, reachable from any non-terminal) +``` + +### Summary + +| Status | Required fields | Forbidden fields | Trigger to leave | +|---|---|---|---| +| `draft` | `description`, `acceptanceCriteria` | `executionRecord`, `implementationPlan` | implementation plan saved → `planned` | +| `planned` | + `implementationPlan` (unabridged); all `depends_on` blockers `done` | `executionRecord` | someone claims via `action='update' status='in_progress'` → `in_progress` | +| `in_progress` | + active worker (one only) | — | work complete + record + ACs + Completion Protocol §2 run → `in_review` | +| `in_review` | + `executionRecord`, `decisions`, `files`, every AC evaluated, `prUrl` (optional sugar — when a PR was opened; backend upserts a `task_links` row with `kind='pull_request'`) | — | HOTL operator inspects PR and flips → `done` (or back to `in_progress` for rework) | +| `done` | (inherited from `in_review`) | — | terminal | +| `cancelled` | + `executionRecord` (rationale + what was tried), `decisions` | — | terminal | + +### `in_progress` + +- **What it means.** Active implementation. Exactly one engineer or agent is working on it. +- **Constraint:** should not span sessions. If work pauses, leave a note in the task or move it back to `planned`. +- **Transitions to `in_review`:** when implementation is complete, `executionRecord` / `decisions` / `files` are populated, acceptance criteria are evaluated, and the Completion Protocol (§2) has run. + +### `in_review` + +- **What it means.** Implementer subagent has finished the work, opened a PR, and populated the full Completion Protocol payload (`executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`). Tests, lint, and typecheck are green. Awaiting human review on the PR. +- **Cannot:** be self-promoted to `done` by any agent. The HOTL operator owns the `in_review → done` transition. 
+- **Transitions to `done`:** when the PR is approved/merged and the operator updates status. No additional payload is required; the implementer already populated everything. +- **Transitions back to `in_progress`:** when the reviewer requests rework. The implementer or a follow-up worker picks the task up again from `in_progress`. + +--- + +## 2. Completion Protocol + +Before transitioning a task to `in_review`, `done`, or `cancelled`: + +### 2.1. Detect mode by transcript + +- **Dispatched mode**: your context shows you were invoked via the Task tool by a parent agent. Mark `in_review` directly with the full payload (the implementer's terminal write); the HOTL operator finalizes to `done`. Return to the parent with the task ref and a one-sentence summary. Do not ask. +- **Direct mode**: invoked by the user in a normal session. Ask "Ready to mark this `in_review`?" with a one-sentence executionRecord preview. Wait for explicit confirmation; the HOTL operator finalizes to `done` after PR approval. +- **Uncertain**: default to asking. A spurious confirmation prompt is cheap; an unauthorized status change is expensive. + +### 2.2. Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. Re-call with the additions before continuing. + +For pure spec-review / docs / decision-only / Mymir-only refinement tasks that touched no repo files, pass `files=[]` explicitly. Omitting the field leaves the prior value in place and the server's "missing files" hint will not clear. The empty array is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +### 2.3. 
Open a PR if the work changed code + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Mymir-only refinement): + +**Detect a PR template** in the repo at one of these paths (or similar): + +- `.github/PULL_REQUEST_TEMPLATE.md` +- `.github/pull_request_template.md` +- `.github/PULL_REQUEST_TEMPLATE/.md` +- `docs/pull_request_template.md` + +**If a template exists**: fill it. Map task fields onto template sections only where they fit. Leave a section blank rather than invent content. Common mappings: + +- Linked issue / linked task: include the `taskRef` in `[BRACKETS]` (e.g. `[MYMR-83]`). Bracket form triggers Mymir PR-status tracking; use it for the ONE primary task this PR builds. Reference any related tasks elsewhere as plain links (no brackets). Add `Closes #N` on its own line if a GitHub issue is being resolved. +- Summary section: 2 to 3 sentences from `executionRecord`. +- Test plan / verification section: the `acceptanceCriteria` items that are checked. +- Decisions or notes-for-reviewer section if present: relevant entries from `decisions`. + +**If no template exists**: use this concise default. + +```markdown +## Summary + +**Task Reference**: [MYMR-XXX] + + + + +## Type of change + +- [ ] Bug fix +- [ ] New feature +- [ ] Refactor / cleanup +- [ ] Documentation + +## Testing + +- [ ] Tested locally with `` +- [ ] Linting and formatting pass (``) +- [ ] Type or build check passes (``) + +## Notes for reviewer + + +``` + +Open the PR with `gh pr create --title '' --body "$(cat <<'EOF' ... EOF)"`. + +**Always concise.** Do not pad sections to look thorough. Empty optional sections beat fabricated content. If the template has prompt questions you cannot answer, skip them rather than make answers up. + +### 2.4. Skip the PR for these task types + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Mymir refinement tasks (no repo changes). 
+- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (a Looker dashboard tweak applied via the Looker UI, a Tableau workbook published from Desktop, a metric definition signed off in a doc, an ad-hoc SQL analysis attached to a ticket, a BRD update in Confluence). In these cases the deliverable lives outside git; record the artifact link or path in `executionRecord` and `files` instead of opening a PR. When the data work IS in a git repo (a dbt project, a SQL repo, a notebook collection under version control), open a PR per the standard rules above. + +When in doubt, ask the user before opening. + +--- + +## 1. Task artifact quality + +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +Decisions come from the refinement, planning, or implementation conversation. When a choice is settled (by you against the codebase, or with the user), record it without being asked. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." +GOOD (agentic): "Use a per-thread tool registry. Two concurrent agent loops were stepping on each other's MCP client state." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. 
If a decision is not grounded in conversation, code, or the artifacts above, leave it out. + +### `files` + +- **Format:** plain repo-relative path strings. No backticks, no quoting. +- **Coverage:** every file created or modified for `in_review` and `done` tasks. +- **Empty `files=[]` is the correct value whenever paths cannot be cited:** pre-implementation tasks (`draft`, `planned`) where the code does not exist yet, research or decision-only tasks, Mymir-only refinements. **Leave empty rather than speculate.** + +--- + +## 6. Markdown formatting and tone + +Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. +- Headings (`##`, `###`) only in long fields like `implementationPlan`. + +### Tone: never sound like AI + +The text you write into Mymir is read by other engineers. It must read like an engineer wrote it, not a chatbot. + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. 
+- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Em-dash replacements + +``` +BAD (web): "Custom auth — months of work — is off the table." +GOOD: "Custom auth is off the table. Months of work, easy to leak data." + +BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not + backported." +GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not + backported." +``` + +### Length + +Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. The rule is "no fluff", not "no length". From dc6edc6472d8dc1a28a01f23dce2d79da8089feb Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:24:04 +0200 Subject: [PATCH 04/45] feat: add reviewer rules extract for composer --- .../composer/references/reviewer-rules.md | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 plugins/claude-code/skills/composer/references/reviewer-rules.md diff --git a/plugins/claude-code/skills/composer/references/reviewer-rules.md b/plugins/claude-code/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..abd25f09 --- /dev/null +++ b/plugins/claude-code/skills/composer/references/reviewer-rules.md @@ -0,0 +1,131 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical mymir references for the review agent. +Mirrors: `skills/mymir/references/conventions.md` §1, +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. Section numbers match the canonical files. When +editing a mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. 
§2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## 1. The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## 2.2. Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Mymir-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +## 2.3. Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Mymir-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Mymir PR-status tracking. Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. 
+ +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +--- + +## 3. Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Mymir's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `mymir_query type='edges'` on the changed task. Current relationships. +2. `mymir_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. + +--- + +## 1. Task artifact quality + +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. 
Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## 6. Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". 
From 7fcf1a52b99cc5161ff94099b727729847a3e760 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:24:56 +0200 Subject: [PATCH 05/45] docs: point canonical mymir refs at composer extracts --- plugins/claude-code/skills/mymir/references/artifacts.md | 2 ++ plugins/claude-code/skills/mymir/references/conventions.md | 2 ++ plugins/claude-code/skills/mymir/references/lifecycle.md | 2 ++ 3 files changed, 6 insertions(+) diff --git a/plugins/claude-code/skills/mymir/references/artifacts.md b/plugins/claude-code/skills/mymir/references/artifacts.md index b391c134..191e13cf 100644 --- a/plugins/claude-code/skills/mymir/references/artifacts.md +++ b/plugins/claude-code/skills/mymir/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Task artifact quality diff --git a/plugins/claude-code/skills/mymir/references/conventions.md b/plugins/claude-code/skills/mymir/references/conventions.md index 89831a34..76605a91 100644 --- a/plugins/claude-code/skills/mymir/references/conventions.md +++ b/plugins/claude-code/skills/mymir/references/conventions.md @@ -6,6 +6,8 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. 
+ --- ## How this is split diff --git a/plugins/claude-code/skills/mymir/references/lifecycle.md b/plugins/claude-code/skills/mymir/references/lifecycle.md index 8de7b09b..f174462f 100644 --- a/plugins/claude-code/skills/mymir/references/lifecycle.md +++ b/plugins/claude-code/skills/mymir/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Status lifecycle From 20d6e929bcf1b5608e66f1ba68ccaad3bcf1149c Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:27:03 +0200 Subject: [PATCH 06/45] refactor: researcher loads slim extract and returns status --- .../claude-code/agents/composer-researcher.md | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index 0bbda0f2..7be87811 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -21,18 +21,17 @@ You are the Phase 1 subagent of `/mymir:composer`. The orchestrator dispatches y ``` Target task: -Project meta: +Project categories and tags: Open questions from prior attempts (optional): ``` Your job is to **refine the target task in Mymir based on what you find, then deliver a research brief** the Phase 2 planner can turn into an unabridged `implementationPlan` without redoing your investigation. 
The refinements you apply (sharper description, binary acceptance criteria, missing tag dimensions, accurate `estimate`/`priority`, security/performance findings recorded as `decisions`) mean the planner reads a task that already reflects ground truth instead of a stale one. The brief is a *report* of what you found and what you applied, plus anything that still needs the planner's or user's judgement. -## Mymir operating context +## Operating rules -The canonical mymir rules load with this agent. Citations later in the file (`conventions §1`, `artifacts §5`, etc.) point into this loaded content. Sections especially relevant to your phase: conventions §1 (Iron Law), §3 (persona), §4 (taskRef format); artifacts §1 (artifact quality), §2 (tag dimensions), §5 (granularity / oversize threshold), §6 (markdown tone). +Your phase rules load with this agent as a slim extract of the canonical mymir references. Citations in this file (`conventions §1`, `artifacts §5`, etc.) resolve inside the extract; the canonical files live at `skills/mymir/references/` if you need a section the extract omits. -@skills/mymir/references/conventions.md -@skills/mymir/references/artifacts.md +@skills/composer/references/researcher-rules.md ## Iron Law of grounding @@ -181,6 +180,17 @@ Return one markdown brief with the following exact sections in this order. Do no ## Confidence + +STATUS: ``` +## Choosing STATUS + +The STATUS line is the last line of your return and the only thing the orchestrator branches on. Pick exactly one: + +- `NEEDS_DECISION`: any of — you raised `oversize-task`, your `## Proposed rewrites` section is non-empty, your confidence is below 0.6, or you raised `external-input-required`. The reason line names which trigger fired. +- `BLOCKED`: you could not ground your findings at all (repo unreadable, task unresolvable, Mymir unreachable). 
+- `DONE_WITH_CONCERNS`: brief is complete and nothing gates, but you raised non-gating flags (`version-drift-major`, `security-boundary-uncovered`, `missing-citation`, `dep-mismatch`, `ambiguous-criterion-unresolved`). +- `DONE`: brief complete, no flags, confidence ≥ 0.6, no proposed rewrites. + The orchestrator passes this brief verbatim to the Phase 2 planner via the Task tool. Keep it scannable: the planner reads it once and acts on it; a wall of prose buries the actionable parts. The refinements you applied are already in Mymir; the planner reads the refined task from `mymir_context depth='planning'`; the brief is the *findings* the planner needs to write the plan against. From ac18d89f60649cfd8d4abc11d9926cf4a84e75d8 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:28:07 +0200 Subject: [PATCH 07/45] refactor: planner loads slim extract and returns status --- plugins/claude-code/agents/composer-planner.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/plugins/claude-code/agents/composer-planner.md b/plugins/claude-code/agents/composer-planner.md index 4ea441a4..ee3f9bbf 100644 --- a/plugins/claude-code/agents/composer-planner.md +++ b/plugins/claude-code/agents/composer-planner.md @@ -28,13 +28,11 @@ Your job is to produce or re-validate the **unabridged `implementationPlan`** th You are the **only** subagent that writes the `draft → planned` status transition. You never write `in_progress` or `done`; those belong to the implementer. -## Mymir operating context +## Operating rules -The canonical mymir rules load with this agent. Citations later (`conventions §1`, `artifacts §1`, `lifecycle §1`, etc.) point into this loaded content. 
Sections especially relevant to your phase: conventions §1 (Iron Law), §3 (persona); artifacts §1 (description / AC quality bars), §4 (category vocabulary; do not coin), §5 (granularity / oversize); lifecycle §1 (status lifecycle and `draft → planned` save semantics), §2 (Completion Protocol payload shape, used when pre-filling the plan's template section). +Your phase rules load with this agent as a slim extract of the canonical mymir references. Citations in this file (`conventions §1`, `artifacts §1`, `lifecycle §1`, etc.) resolve inside the extract; the canonical files live at `skills/mymir/references/` if you need a section the extract omits. -@skills/mymir/references/conventions.md -@skills/mymir/references/artifacts.md -@skills/mymir/references/lifecycle.md +@skills/composer/references/planner-rules.md ### Branching on entry status @@ -165,6 +163,15 @@ When entry status was already `planned`, do **not** pass the `status` field at a No long summary; the plan is already in Mymir. + End your return with a final line: + + `STATUS: ` + + - `DONE`: plan saved and verified, or silent re-validation kept an existing valid plan. + - `DONE_WITH_CONCERNS`: plan saved, but you noted risks the implementer should see (name them in the confirmation sentence). + - `NEEDS_DECISION`: the brief left an open question the plan cannot resolve without the user (rare; the researcher should have gated it). + - `BLOCKED`: the plan write failed verification after your own retry, or the task is in a state you must not plan from. + ## What this phase does not do - It does not edit code. The plan is text; implementation is Phase 3. 
From 045595ed6081279f0e743bbac252997b731a1216 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:29:01 +0200 Subject: [PATCH 08/45] feat: add fix mode and status line to composer implementer --- .../agents/composer-implementer.md | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index d9c42eb6..02e55224 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -26,19 +26,18 @@ You are the Phase 3 subagent of `/mymir:composer`. The orchestrator dispatches y Target task: Plan is saved to Mymir. Fetch via mymir_context depth='agent'. Optional: prior failed attempt's failure summary. +Optional (fix mode): "Fix mode. PR: ." plus the reviewer's blocking findings verbatim. ``` Your job is to **ship the task end-to-end**: implement the plan, run the project's verification commands until green, open a PR, and mark the task `in_review` with a complete Completion Protocol payload. You are the only phase that writes code and the only phase that marks the task `in_review`. The HOTL operator finalizes `in_review → done` outside the composer loop. You operate in dispatched mode: the orchestrator (and behind it, the user) has already approved the plan. Do not ask the user mid-implementation; do not pause for a HOTL gate. If the plan is broken or unimplementable as written, surface it as a single concrete failure summary back to the orchestrator and stop. Do not guess. -## Mymir operating context +## Operating rules -The canonical mymir rules load with this agent. Citations later (`conventions §1`, `lifecycle §2`, etc.) point into this loaded content. 
Sections especially relevant to your phase: conventions §1 (Iron Law: `executionRecord` and `decisions` cite real code or are omitted), §2 (`_hints` discipline: read every `mymir_task` response's `_hints` array and act on it); lifecycle §1 (required fields per status; `done` requires `executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`), §2 (Completion Protocol, PR template detection, bracket form, `gh pr create`), §3 (propagation, informational here; the orchestrator runs it after you return); artifacts §1 (executionRecord shape), §6 (markdown tone: no em dashes, no AI slop, no "I have implemented…" preambles). +Your phase rules load with this agent as a slim extract of the canonical mymir references. Citations in this file (`conventions §1`, `lifecycle §2`, etc.) resolve inside the extract; the canonical files live at `skills/mymir/references/` if you need a section the extract omits. -@skills/mymir/references/conventions.md -@skills/mymir/references/lifecycle.md -@skills/mymir/references/artifacts.md +@skills/composer/references/implementer-rules.md ## Iron Law of grounding @@ -65,7 +64,7 @@ conventions §1 applies to your `executionRecord`, your `decisions`, and your `a You own two transitions: `planned → in_progress` (your claim, before you touch code) and `in_progress → in_review` (the Completion Protocol payload, after the PR opens). The legal status values you may pass to `mymir_task` are exactly these two: -- `status='in_progress'`: legal **only when entry status was `planned`** (or `in_progress` from a prior retry attempt). Send it as a single-field update before any code edits; this is your claim. +- `status='in_progress'`: legal when entry status was `planned` (or `in_progress` from a prior retry attempt), **or when entry status is `in_review` and your dispatch says fix mode** — that rotation re-opens your own completed hand-off to address review findings, never someone else's. 
Send it as a single-field update before any code edits; this is your claim. - `status='in_review'`: legal **only when entry status was `in_progress`** (your own claim). Send it together with the full Completion Protocol payload (`executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`). The HOTL operator finalizes `in_review → done` after PR approval; agents never self-promote. - `status='done'`: forbidden. Only the HOTL operator writes `done`; never composer, never an implementer. - `status='planned'`: forbidden. You never demote a task; the planner owns `planned`. @@ -161,6 +160,9 @@ mymir_task action='update' taskId='' Return to the orchestrator with one line: > `` handed off for review. PR ``. Tests/typecheck/lint green. `/` acceptance criteria satisfied. Awaiting HOTL approval. +> STATUS: DONE — handed off for review + +Use `STATUS: DONE_WITH_CONCERNS — ` instead when the work is complete but you carry a concern worth the orchestrator's attention (e.g. an AC satisfied through an approach the plan did not anticipate). #### Failure path @@ -175,6 +177,19 @@ c. If you opened a PR before discovering the failure, leave it open in draft sta d. Return to the orchestrator with one line: > `` failed. Reason: ``. PR ``. Task left at `in_progress` for retry or manual review. + > STATUS: BLOCKED — + +## Fix mode + +When the dispatch says fix mode, the reviewer requested changes on your PR and the orchestrator is rotating you back in. The scope is the cited findings, nothing else. + +1. `mymir_context depth='agent' taskId=''`. Confirm status is `in_review` and the PR matches the dispatch URL. Anything else: report the mismatch and exit with `STATUS: BLOCKED`. +2. `mymir_task action='update' taskId='' status='in_progress'`. This is the fix-rotation claim. +3. Check out the existing branch (`gh pr view --json headRefName`); never create a new branch or PR. +4. Address **exactly the blocking findings in the dispatch**. 
No replanning, no scope expansion, no drive-by refactors. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. +5. Re-run the full verification suite (typecheck, lint, tests) until green, push to the same branch. +6. Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched). +7. Return: ` fix rotation complete. PR . .` plus the STATUS line per the success/failure paths above. ## What this phase does not do From 8208b419735c1f7585e6950dfcba758de595323a Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:30:38 +0200 Subject: [PATCH 09/45] refactor: review agent loads slim extract and returns status --- plugins/claude-code/agents/review.md | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/plugins/claude-code/agents/review.md b/plugins/claude-code/agents/review.md index 10383aab..4ccf74b1 100644 --- a/plugins/claude-code/agents/review.md +++ b/plugins/claude-code/agents/review.md @@ -32,24 +32,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical mymir references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/mymir/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/mymir/references/conventions.md`. 
Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/mymir/references/lifecycle.md`. Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/mymir/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/mymir/references/conventions.md -@skills/mymir/references/lifecycle.md -@skills/mymir/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -292,6 +279,13 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: <DONE | BLOCKED>` + +- `DONE`: you delivered a verdict. **All three verdicts return `STATUS: DONE`** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `mymir_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `mymir_task` write access. @@ -322,7 +316,7 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/mymir/references/conventions.md` at session start, and re-read mid-session when uncertain. +- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff.
Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. - ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. From 07b940d83e2aad4e036d67d3f0042bf5343ea3d0 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:32:20 +0200 Subject: [PATCH 10/45] feat: restructure composer skill as workflow loop --- plugins/claude-code/skills/composer/SKILL.md | 299 ++++++++++--------- 1 file changed, 156 insertions(+), 143 deletions(-) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 2013847a..ca00503e 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -2,195 +2,208 @@ name: composer description: > Use when the user types /mymir:composer or /mymir:composer , or - asks composer to "run the next task", "ship the backlog", "compose - through my ready queue", "loop through mymir tasks", or otherwise - requests end-to-end Mymir task delivery (research → plan → implement → - propagate, then pick the next task and repeat). Composer dispatches one - fresh subagent per phase per task so each phase runs with a clean - context window and a focused tool set; the orchestrator itself only - picks tasks, hands off, and propagates. Do NOT invoke for one-off task - lookups, status checks, refinement of one task by hand, or planning a - single task interactively. 
Those flows belong to the mymir skill and - using composer for them adds latency without adding quality. + asks to run the next Mymir task end-to-end, ship the backlog, compose + through the ready queue, or loop through Mymir tasks until done. Do NOT + invoke for one-off task lookups, status checks, hand-refinement of one + task, or interactive planning of a single task; those flows belong to the + mymir skill and composer adds latency without adding quality. --- # Composer -Composer is a Mymir task orchestrator. It picks the next ready task off the project's critical path, dispatches four subagents in sequence (research, plan, implement, review) to deliver it end-to-end with production-grade quality, propagates the result through the graph, and loops until the queue is empty or the user stops. Each subagent runs in a fresh context with a focused tool set; the main orchestrator stays clean across the whole session. +Composer is a Mymir task orchestrator. Per iteration it picks the next ready task off the project's critical path, dispatches four phase subagents in sequence (research, plan, implement, review), runs a bounded review→fix loop, propagates the result through the graph, and continues until a structural stop condition holds. Each subagent runs in a fresh context with a focused tool set; the orchestrator stays clean and writes nothing to tasks except propagation edge updates and rewrites the user explicitly accepts at a `NEEDS_DECISION` gate (see *Status vocabulary*). -Composer is glue. The heavy lifting (task selection, refinement, the Completion Protocol, propagation) already lives in the `mymir` skill (`plugins/claude-code/skills/mymir/SKILL.md`). Composer reuses those flows verbatim rather than duplicating them. +Composer is glue. The heavy lifting (task selection, refinement, the Completion Protocol, propagation) lives in the `mymir` skill (`plugins/claude-code/skills/mymir/SKILL.md`); composer reuses those flows rather than duplicating them.
## Invocation -Two modes, both surfaced as slash commands by the plugin: +- **`/mymir:composer`**: backlog mode. Pick the highest-value ready task each iteration; continue until a stop condition holds. +- **`/mymir:composer `**: single-task mode. Same pipeline applied to one task; exits after the iteration completes. -- **`/mymir:composer`**: backlog loop. The orchestrator picks the highest-value ready task each iteration and keeps going until a stop condition fires. -- **`/mymir:composer `**: single-task mode (e.g. `/mymir:composer ZIN-42`). Same pipeline applied to one task; the loop exits after the reviewer hands its verdict back to the orchestrator. - -If the user typed `/mymir:composer` with no argument, treat it as backlog mode. Anything else is single-task. - -## The four subagents - -Each subagent is a registered plugin agent. The orchestrator dispatches them via the Task tool by `subagent_type`. They have their own files; do not duplicate their logic here. - -| Phase | `subagent_type` | File | Writes to Mymir | Returns to orchestrator | -| --- | --- | --- | --- | --- | -| 1. Research | `mymir:composer-researcher` | `plugins/claude-code/agents/composer-researcher.md` | Refinement fields on the target task (`description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`); **never `status`, `implementationPlan`, `executionRecord`, or `files`** | A research brief: files to touch, existing patterns, library docs (with version-pin checks), security/perf considerations, project conventions, applied refinements with citations, open questions, flags | -| 2. Plan | `mymir:composer-planner` | `plugins/claude-code/agents/composer-planner.md` | `implementationPlan` and `decisions`; `status='planned'` only on `draft → planned` transition; nothing else | Saves the unabridged `implementationPlan` to Mymir; transitions the task `draft → planned` when entering at `draft`; returns a one-sentence confirmation | -| 3. 
Implement | `mymir:composer-implementer` | `plugins/claude-code/agents/composer-implementer.md` | `status='in_progress'` (claim) and `status='in_review'` (with Completion Protocol payload: `executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`); HOTL flips `in_review → done` post-approval, outside composer's loop | Writes code on a feature branch, runs tests/lint/typecheck, opens a PR, marks the task `in_review` in dispatched mode; returns the PR URL plus a one-sentence summary | -| 4. Review | `mymir:review` | `plugins/claude-code/agents/review.md` | **Nothing.** Review is read-only over Mymir; the verdict travels in the return message, not in any task field. The HOTL operator owns the `in_review → done` transition regardless of the verdict | A structured verdict (`approve` / `request-changes` / `block`) with file-cited reasoning across the security, performance, reliability, observability, and codebase-standards lenses; AC evaluation against the diff; plan-vs-files drift; downstream impact list | - -The contract is intentionally tight: the researcher applies refinements directly so the task row reflects ground truth before planning starts; the brief is the planner's *findings reference*, while the refined task itself is the planner's *input*. The planner's output also lands in Mymir, so the implementer reads everything (refined description, refined ACs, the implementation plan, upstream decisions) from `mymir_context depth='agent'` rather than receiving it from the orchestrator. The reviewer reads the same task row through `mymir_context depth='review'`, which renders the implementation plan alongside the executionRecord, surfaces the PR link from `task_links`, and computes plan-vs-files drift. Every dispatch payload stays small; the source of truth is one place: the task row. +No argument means backlog mode; anything else is single-task. ## Mymir operating context -The canonical mymir rules load with this skill. 
Treat their content as part of your operating context; downstream citations (`conventions §1`, `artifacts §5`, etc.) refer to the loaded text. +The canonical mymir rules load with this skill. Downstream citations (`conventions §1`, `artifacts §3`, `lifecycle §3`) refer to this loaded text. @skills/mymir/references/conventions.md @skills/mymir/references/artifacts.md @skills/mymir/references/lifecycle.md @skills/mymir/references/resilience.md -## Session bootstrap (first turn of every composer session) - -Do these once, before the first iteration: - -1. **Resolve the project.** `mymir_project action='list'` → confirm with `action='select' projectId='...'`. If the user is in single-task mode, also run `mymir_query type='search' query=''` to resolve the task UUID and surface its `state` hint. -2. **Read project meta.** `mymir_query type='meta' projectId='...'`. Capture categories, tag vocabulary, and status counts in memory; pass them verbatim to each researcher dispatch so personas ground on the project taxonomy. -3. **Install the goal harness.** Generate the goal-condition string below and prompt the user to paste it into `/goal`. Composer cannot install `/goal` itself; the user has to type it. Emit the literal code-fence so the user can copy-paste: - - ```` - /goal mymir_analyze type='ready' returns an empty set, OR composer reports three consecutive failed attempts on a task, OR the user types stop, OR (single-task mode) composer reports the target task marked done, OR (single-task mode) composer reports proposed rewrite denied on a task - ```` - - The `/goal` evaluator watches the transcript each turn and ends the session when one of the literal phrases above appears. Composer's job is to emit those literal phrases at the right moments (see *Stop conditions*). - -4. **Confirm the harness fired.** Call `AskUserQuestion`: "Did `/goal` accept the harness?" with options yes / no. On yes, proceed to the loop. 
On no, emit a one-line warning ("Backlog mode without `/goal` has no automatic exit; type `stop` to halt the loop.") and proceed anyway. Composer cannot force the install; it can only refuse to start silently. - -In backlog mode the harness is required; in single-task mode it is optional but recommended. Long single-task runs still benefit from the safety bound. - -## Loop - -```text -pick_task → dispatch researcher → dispatch planner → dispatch implementer → dispatch reviewer → propagate → loop +## The four phase subagents + +Each is a registered plugin agent dispatched via the Task tool by `subagent_type`. Their contracts live in their own files; do not duplicate their logic here. + +| Phase | `subagent_type` | Writes to Mymir | Returns | +| --- | --- | --- | --- | +| 1. Research | `mymir:composer-researcher` | Refinement fields only (`description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`); never `status` | Research brief + `STATUS` line | +| 2. Plan | `mymir:composer-planner` | `implementationPlan`, `decisions`; `status='planned'` on the `draft → planned` transition only | One-sentence confirmation + `STATUS` line | +| 3. Implement | `mymir:composer-implementer` | `status='in_progress'` (claim), `status='in_review'` (+ full Completion Protocol payload); in fix mode rotates `in_review → in_progress → in_review` | PR URL + one-line summary + `STATUS` line | +| 4. Review | `mymir:review` | Nothing (read-only over Mymir) | Structured verdict + `STATUS` line | + +The task row is the single source of truth. The researcher refines it before planning; the planner saves the plan to it; the implementer reads everything (refined description, ACs, plan, upstream decisions) from `mymir_context depth='agent'`; the reviewer reads `mymir_context depth='review'`. Dispatch payloads stay minimal (see *Dispatch hygiene*). + +## Status vocabulary + +Every subagent return ends with `STATUS: `. 
Branch on the status, not on your reading of the prose: + +| STATUS | Meaning | Orchestrator reaction | +| --- | --- | --- | +| `DONE` | Phase output complete | Advance to the next phase | +| `DONE_WITH_CONCERNS` | Complete, but the agent flagged doubts | Quote the concerns in the iteration log, then advance | +| `NEEDS_DECISION` | A user decision is required | Gate via `AskUserQuestion`; act on the answer | +| `BLOCKED` | Phase cannot complete | *Failure handling* | + +Expected `NEEDS_DECISION` triggers (all from the researcher): + +- **Oversize** (`oversize-task` flag): offer to dispatch `mymir:decompose-task` or skip the task. Composer never splits a task itself. +- **Proposed rewrites** (`## Proposed rewrites` non-empty): show original vs proposed per field with the researcher's rationale; offer accept / deny. On accept, apply via `mymir_task action='update'` and re-dispatch the researcher on the rewritten task (the old brief is invalid). On deny, end the iteration: backlog mode picks the next task; single-task mode stops. +- **Low confidence or external input** (confidence < 0.6, `external-input-required`): surface the open questions, wait for answers, re-dispatch with the answers appended. + +A return without a STATUS line is malformed: re-read the prose once; if the outcome is still ambiguous, treat it as `BLOCKED`. + +## Session bootstrap + +Once per session, before the first iteration: + +1. **Resolve the project.** `mymir_project action='list'` → `action='select' projectId='...'`. Single-task mode: also `mymir_query type='search' query=''` to resolve the task UUID and current status. +2. **Read meta.** `mymir_query type='meta'`. Keep the categories and tag vocabulary for researcher dispatches; drop the status counts. + +Then start iterating. There is nothing to install and nothing to confirm. 
+ +## The loop + +At the start of each iteration, materialize these steps as todos and mark them off as you go (the todo list is your compaction anchor): pick, research, plan, implement, review, surface verdict, propagate. + +```dot +digraph composer_iteration { + "Pick next task" [shape=box]; + "Ready or plannable task?" [shape=diamond]; + "STOP: backlog drained" [shape=doublecircle]; + "Dispatch researcher" [shape=box]; + "Researcher STATUS?" [shape=diamond]; + "Gate with user" [shape=box]; + "Continue this task?" [shape=diamond]; + "STOP: iteration ends (single-task)" [shape=doublecircle]; + "Dispatch planner" [shape=box]; + "Planner STATUS?" [shape=diamond]; + "Dispatch implementer" [shape=box]; + "Implementer STATUS?" [shape=diamond]; + "Dispatch reviewer" [shape=box]; + "Reviewer STATUS?" [shape=diamond]; + "Verdict?" [shape=diamond]; + "Fix rotations used < 2?" [shape=diamond]; + "Dispatch implementer in fix mode" [shape=box]; + "Escalate all verdicts to HOTL" [shape=box]; + "Surface verdict + propagate" [shape=box]; + "Failure handling" [shape=box]; + "Single-task mode?" [shape=diamond]; + "STOP: iteration complete" [shape=doublecircle]; + + "Pick next task" -> "Ready or plannable task?"; + "Ready or plannable task?" -> "STOP: backlog drained" [label="no"]; + "Ready or plannable task?" -> "Dispatch researcher" [label="yes"]; + "Dispatch researcher" -> "Researcher STATUS?"; + "Researcher STATUS?" -> "Dispatch planner" [label="DONE / DONE_WITH_CONCERNS"]; + "Researcher STATUS?" -> "Gate with user" [label="NEEDS_DECISION"]; + "Researcher STATUS?" -> "Failure handling" [label="BLOCKED"]; + "Gate with user" -> "Continue this task?"; + "Continue this task?" -> "Dispatch researcher" [label="yes: re-dispatch with answers"]; + "Continue this task?" -> "Pick next task" [label="no (backlog)"]; + "Continue this task?" -> "STOP: iteration ends (single-task)" [label="no (single-task)"]; + "Dispatch planner" -> "Planner STATUS?"; + "Planner STATUS?" 
-> "Dispatch implementer" [label="DONE / DONE_WITH_CONCERNS"]; + "Planner STATUS?" -> "Failure handling" [label="BLOCKED"]; + "Dispatch implementer" -> "Implementer STATUS?"; + "Implementer STATUS?" -> "Dispatch reviewer" [label="DONE / DONE_WITH_CONCERNS"]; + "Implementer STATUS?" -> "Failure handling" [label="BLOCKED"]; + "Dispatch reviewer" -> "Reviewer STATUS?"; + "Reviewer STATUS?" -> "Verdict?" [label="DONE"]; + "Reviewer STATUS?" -> "Failure handling" [label="BLOCKED"]; + "Verdict?" -> "Surface verdict + propagate" [label="approve"]; + "Verdict?" -> "Fix rotations used < 2?" [label="request-changes"]; + "Verdict?" -> "Escalate all verdicts to HOTL" [label="block"]; + "Fix rotations used < 2?" -> "Dispatch implementer in fix mode" [label="yes"]; + "Fix rotations used < 2?" -> "Escalate all verdicts to HOTL" [label="no"]; + "Dispatch implementer in fix mode" -> "Implementer STATUS?"; + "Escalate all verdicts to HOTL" -> "Surface verdict + propagate"; + "Surface verdict + propagate" -> "Single-task mode?"; + "Single-task mode?" -> "STOP: iteration complete" [label="yes"]; + "Single-task mode?" -> "Pick next task" [label="no"]; +} ``` -Per iteration the orchestrator runs: - -1. **Pick the next task.** - - Backlog mode: `mymir_analyze type='ready' projectId='...'` ∩ `mymir_analyze type='critical_path' projectId='...'`. Rank intersection by priority (`urgent > core > normal > backlog`), break ties by lowest `estimate`. Fall back to highest-priority `ready` task if the intersection is empty. Fall back to `mymir_analyze type='plannable'` if `ready` itself is empty (route through researcher + planner first; nothing to implement yet). - - Single-task mode: skip selection. The task is the one the user named. If its `state` is already `done` or `cancelled`, emit the done line (see *Stop conditions*) and exit. - - Emit a one-paragraph **pick rationale** before claiming so the user can interject: - > Next pick: ``. Priority=``, estimate=``, on critical path=``. 
Reason: ``. - -2. **Dispatch researcher.** One `Agent` call with `subagent_type='mymir:composer-researcher'`. The prompt body opens with `Target task: ` and includes the project's meta payload from bootstrap step 3 verbatim. The task stays at its current status (`draft` if picked from `plannable`, `planned` if picked from `ready`). Researchers do not claim, but they **do** refine: the researcher applies sharpening edits to `description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, and `decisions` based on what it finds in the codebase, in docs, and in its security/performance review. The task row evolves under your feet during this phase; that is intentional. Await the brief. Refinement writes are append-only and cannot fail destructively; the only way Phase 1 fails is if the researcher cannot ground its findings (returns `confidence < 0.6` or flags items in *Open questions*). In that case, surface those to the user and pause for an answer before continuing. - - **Post-researcher gates.** Two signals can divert the iteration before the planner runs. If the brief carries the `oversize-task` flag, defer to *Oversize handling* below. If the brief carries a `## Proposed rewrites` section, defer to *Proposed rewrites handling* below. Estimate refinements within the bounded scale (`1, 2, 3, 5, 8, 13`) are normal refinement and do not gate. +### Step details -3. **Dispatch planner.** One `Agent` call with `subagent_type='mymir:composer-planner'`. The prompt body includes `Target task: `, the task's current `status` so the planner knows whether it is writing a new plan or re-validating an existing one, the research brief verbatim, and a pointer to `mymir_context depth='planning'` (the planner fetches it itself). 
The planner owns the `draft → planned` transition: when the task entered at `draft`, the planner writes the full `implementationPlan` and flips status to `planned` in one call; when the task entered at `planned`, the planner re-validates against the brief and either keeps the plan as-is without mutating the task (a silent re-validation is the correct trace) or refreshes the plan when the brief shows real drift. Verify the planner's write by polling `mymir_context depth='summary' taskId=''` once before advancing. If no plan is visible after a `draft` entry (or the planner reports failure), retry once with the failure message appended to the dispatch; on a second failure, treat the iteration as a failed attempt (see *Failure handling*). +1. **Pick.** Backlog: `mymir_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. Fall back to the highest-priority `ready` task when the intersection is empty, then to `mymir_analyze type='plannable'` when `ready` is empty (those route through research + plan; nothing to implement yet). Single-task: the named task; if already `done` or `cancelled`, report that and stop. Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval — the user interrupts if they disagree. -4. **Dispatch implementer.** One `Agent` call with `subagent_type='mymir:composer-implementer'`. The prompt body is short: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark the task `in_review` in dispatched mode per the Completion Protocol (the HOTL operator finalizes `in_review → done` after PR approval).` Await the implementer's return. 
The implementer owns the `planned → in_progress` claim, the `in_progress → in_review` completion, the PR creation, and the full Completion Protocol payload; the orchestrator writes none of these. +2. **Research.** Dispatch `mymir:composer-researcher` with: `Target task: `, the categories + tag vocabulary from bootstrap, and (on re-dispatch) the user's gate answers. Status does not change in this phase; the researcher refines the task row in place. React per *Status vocabulary*. -5. **Dispatch reviewer.** Once the implementer reports `in_review` and returns a PR URL, dispatch the review subagent. One `Agent` call with `subagent_type='mymir:review'`. The prompt body is short: `Target task: . PR URL: . Mode: composer-phase-4. Fetch the bundle via mymir_context depth='review' taskId=''.` Await the verdict. The reviewer is read-only over Mymir; it does not flip status, write to `decisions`, or touch the working tree. Surface the verdict block verbatim to the user (HOTL) so they can act on it on GitHub. The orchestrator does not interpret `request-changes` or `block` as a retry signal; the HOTL operator owns the next move. A `block` verdict still falls through to propagation (the downstream graph still needs honest edges), but the iteration ends after propagation regardless of verdict. +3. **Plan.** Dispatch `mymir:composer-planner` with: `Target task: `, the task's current status (so it knows new-plan vs re-validate), and the research brief verbatim. Verify with one `mymir_context depth='summary' taskId=''` poll: a `draft` entry must now show a plan and `status='planned'`. If not, re-dispatch once with the failure appended; a second miss is `BLOCKED`. -6. **Propagate.** After the verdict is surfaced, run propagation per lifecycle §3: `mymir_query type='edges' taskId=''` then `mymir_analyze type='downstream' taskId=''`. Update or retire edge notes the implementer's work invalidated. 
Edge-note content follows artifacts §3: one to three short sentences, written as a brief to the downstream task's coding agent (what specifically does this task get from the target). No prose recaps. Surface newly-unblocked tasks in the next pick rationale. +4. **Implement.** Dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. -7. **Loop.** Single-task mode: emit the done line (see *Stop conditions* item 4) and exit. Backlog mode: return to step 1. +5. **Review and the fix loop.** Dispatch `mymir:review` with: `Target task: . PR URL: . Mode: composer-phase-4. Fetch the bundle via mymir_context depth='review'.` On `STATUS: DONE`, branch on the verdict payload: + - **`approve`**: go to step 6. + - **`request-changes`**, with fewer than 2 fix rotations used on this task: dispatch the implementer in fix mode — `Target task: . Fix mode. PR: . Address exactly these review findings, re-run verification, re-mark in_review:` followed by the verdict's blocking findings verbatim. On the implementer's `DONE` or `DONE_WITH_CONCERNS`, re-dispatch the reviewer (same dispatch shape). Each fix dispatch + re-review is one rotation. + - **`request-changes`** with 2 rotations used, or **`block`**: stop fixing. Escalate every verdict from this task to HOTL and go to step 6. `block` is never auto-fixed; review.md calibrates it as "one rotation will not land this". + - The verdict is advisory beyond the fix loop: HOTL owns `in_review → done` on GitHub regardless of verdict. -### Oversize handling +6. **Surface + propagate.** Quote the final verdict block verbatim.
Then propagate per lifecycle §3: `mymir_query type='edges' taskId=''`, `mymir_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3 — one to three short sentences addressed to the downstream task's agent). Surface newly-unblocked tasks in the next pick rationale. -`estimate` is bounded to Fibonacci values `1, 2, 3, 5, 8, 13` (artifacts §5); no task in Mymir can carry an estimate above 13. Oversize is a *scope-detection* signal, not a numeric overflow: the researcher discovers during exploration that a task's true scope exceeds what `13` represents and raises the `oversize-task` flag in the brief. +7. **Loop.** Single-task: report the iteration outcome and stop. Backlog: next iteration, no pause. -Single checkpoint, post-researcher: if the brief carries `oversize-task`, surface the task ref and ask the user whether to dispatch `mymir:decompose-task` to split or skip and pick the next ready task. Do not write a plan. Do not claim. Composer is not a decomposer; oversize routes out to the specialist agent before the planner runs. +## Dispatch hygiene -Estimate refinements within the bounded scale (researcher bumps `5` to `8`, or `13` down to `8`) are normal. Needs evolve as exploration uncovers scope; the researcher updates `estimate` up or down within `[1, 13]` as warranted. That is refinement, not an oversize event. - -### Proposed rewrites handling - -The researcher may propose substantive rewrites of `description` or `acceptanceCriteria` rather than apply them directly (researcher prose: *Substantive rewrites: propose, do not apply*). When the brief carries a `## Proposed rewrites` section, do not advance to the planner. Surface each proposal to the user via `AskUserQuestion`: show the original value, the proposed value, and the researcher's one-line rationale; offer accept / deny per field. 
- -On accept, apply the proposal via `mymir_task action='update'` and re-dispatch the researcher with the rewritten task. The fresh research run reads the rewritten description and AC as ground truth, writes a new brief, and the planner runs against that brief. A rewrite the user accepted invalidates the prior brief; re-dispatching is what keeps the planner grounded in the post-rewrite scope. - -On deny, end the iteration. Backlog mode: pick the next task; the denied task keeps its silently-applied refinements and stays at its current status. Single-task mode: emit `composer reports proposed rewrite denied on ` to the transcript (matches the `/goal` clause) and exit. - -Subsequent rewrite proposals on the re-dispatched run go through the same gate. The user can deny at any cycle to break out; there is no implicit cap. - -### Phase entry and exit conditions - -| Phase | Entry condition | Exit condition | Failure surface | -|---|---|---|---| -| Researcher | Task at `draft` or `planned`; pick rationale emitted | Brief returned, `confidence ≥ 0.6`, no `oversize-task` flag, no pending `## Proposed rewrites` (or all accepted and re-dispatched), refinements landed in Mymir | `confidence < 0.6` pauses for user; oversize routes to *Oversize handling*; proposed rewrites route to *Proposed rewrites handling* | -| Planner | Task at `draft` (write new plan) or `planned` (re-validate); brief in dispatch prompt | `implementationPlan` visible via `mymir_context depth='summary'`; status flipped to `planned` if entry was `draft` | No plan after one retry counts as a failed attempt per *Failure handling* | -| Implementer | Task at `planned`; plan saved to Mymir | Status `in_review`, full Completion Protocol payload, PR URL returned (HOTL flips to `done` outside composer) | Tests/lint/typecheck red unrecoverable, or PR not opened, counts as a failed attempt; partial success (PR opened, `in_review` not marked) recovered per *Failure handling* | -| Reviewer | Task at `in_review`; PR URL 
visible on `task.links` (kind `pull_request`) or supplied in dispatch | Structured verdict returned (`approve` / `request-changes` / `block`); the verdict text is the iteration's hand-off artifact to HOTL; no Mymir writes from this phase | Reviewer cannot reach `mymir_context depth='review'`, or the bundle reports status mismatch and the dispatch is genuinely premature, counts as a failed attempt per *Failure handling*. A `block` or `request-changes` verdict is NOT a failure; it is the correct outcome of a careful review and surfaces straight to HOTL. | - -**Recovering after orchestrator compaction.** Infer the current phase from the task's Mymir status alone. `draft` with no plan: researcher pending. `draft` with plan present, or `planned`: planner done, implementer pending. `in_progress`: implementer pending or partial-success recovery (see *Failure handling*). `in_review`: implementer done; reviewer pending or already returned. When the transcript shows no verdict yet, dispatch the reviewer. When the verdict is in the transcript, advance to propagation. `done`: HOTL approved; iteration complete; advance to propagation if propagation has not yet run. - -### The orchestrator does not write `status` - -This is load-bearing and the most common way an orchestrator like composer goes wrong. **Every lifecycle transition belongs to a subagent**, never to the orchestrator: - -- `draft → planned`: **planner**, when it saves the `implementationPlan` in one atomic update. -- `planned → in_progress`: **implementer**, as its claim before any code is touched. -- `in_progress → in_review`: **implementer**, with the full Completion Protocol payload (`executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`) after the PR opens. -- `in_review → done`: **HOTL operator**, after PR approval/merge; never automatic and never written by composer or any subagent. The reviewer's verdict is advisory; HOTL still owns the transition. 
-- `* → cancelled`: never automatic; only triggered by an explicit user request, and even then routed through the appropriate subagent. - -The orchestrator's only Mymir writes per iteration are **edge updates during propagation** (step 6), and even those are conditional on what propagation discovers. Picking a task does not claim it. Dispatching a researcher does not claim it. Dispatching a reviewer does not flip status. The implementer is the only writer of `status='in_progress'`; the HOTL operator is the only writer of `status='done'`. - -Violating this rule (e.g., claiming `in_progress` at pick time so "no other agent grabs the task") looks innocuous but breaks the mymir contract in three ways: it forces a `draft` task into `in_progress` without a plan, it puts the task in `in_progress` while a read-only researcher runs (misleading anyone watching the project), and it suppresses the planner's `_hints` that fire on the legitimate `draft → planned` transition. +Subagents inherit nothing from this session; the dispatch prompt is their whole world beyond their own agent file and tools. Keep every dispatch to the phase minimum shown in *Step details*. Never paste orchestrator transcript, prior-iteration summaries, full meta payloads, or mymir reference text into a dispatch — the agents load their own rules extract and fetch task context from Mymir themselves. Oversized dispatches make agents worse, not better. ## Failure handling -A failed attempt is any of: implementer reports tests/lint/typecheck red and cannot self-recover; implementer returns without opening a PR; planner cannot save a plan after one retry; reviewer cannot reach `mymir_context depth='review'` or the dispatch is premature (task not at `in_review`). A reviewer verdict of `request-changes` or `block` is NOT a failure; it is the correct outcome of a careful review and is surfaced to HOTL like any other verdict. On failure: - -1. Do not write the failure to `decisions`. 
Per artifacts §1, `decisions` is CHOICE + WHY only; "attempt N failed" is process metadata and pollutes the field. Keep the failure summary in the orchestrator's own transcript (the user sees it directly there) and let the data layer's audit log carry the rest. -2. Leave the task at its current Mymir status (do not auto-cancel; the task is not broken, the attempt was). -3. In backlog mode, move on to the next pick. In single-task mode, retry the iteration up to three total attempts (counting attempt 1). After three failures, emit `composer reports three consecutive failed attempts on ` to the transcript (matches the `/goal` clause) and exit. - - **Why the asymmetry.** Backlog mode optimizes throughput across the queue; a stubborn task should not block other ready work. The failed task stays at `in_progress` for human triage. Single-task mode optimizes completion of one named task; retries are warranted because there is nothing else to fall through to. +`BLOCKED` from any phase is a failed attempt. On failure: -Each retry dispatches the implementer fresh with the parent attempt's failure summary appended to the prompt; the researcher and planner are not re-run unless the failure clearly traces to a planning gap (e.g., the plan references a file that does not exist). +1. Keep the failure summary in your transcript. Do not write it to `decisions` — per artifacts §1 that field is CHOICE + WHY, not process metadata. +2. Leave the task at its current status. Never roll back, never cancel. +3. Backlog mode: move to the next pick; the stuck task stays where it is for human triage. Single-task mode: retry the failed phase up to three total attempts on the task, appending each failure summary to the re-dispatch; after the third, report and stop. Re-run research or planning only when the failure clearly traces to a planning gap (e.g. the plan names a file that does not exist). 
-**Partial success: PR opened, `in_review` not marked.** If a retry's pre-flight finds the task at `in_progress` with an open PR matching the branch name pattern (`/-`), do not re-implement. Resume the Completion Protocol: re-evaluate acceptance criteria against the PR diff, populate `executionRecord` / `decisions` / `files`, mark `in_review`. The PR is the load-bearing artifact; the missing status write is recoverable. This is a single-attempt recovery; if it fails, count it toward the failure budget per rule 3. +**Partial success (PR exists, `in_review` not marked):** when a retry's pre-flight finds the task at `in_progress` with an open PR matching `/-`, do not re-implement. Dispatch the implementer to resume the Completion Protocol against the existing PR (re-evaluate ACs, populate the payload, mark `in_review`). Counts as one attempt. ## Stop conditions -The orchestrator emits one of these literal phrases to the transcript when the corresponding state holds. `/goal` matches against them and ends the session. +Stop and report in plain language (there are no magic stop phrases) when one of these holds: -1. `mymir_analyze type='ready' returns an empty set`: backlog drained. -2. `composer reports three consecutive failed attempts on `: same task failed three times in single-task mode (or after the orchestrator manually retried in backlog mode). -3. The user typed `stop` at any prompt: exit immediately after the current in-flight write finishes. -4. (Single-task mode only) `composer reports the target task marked done`: emitted right after step 7 reaches the loop exit (propagation done, verdict surfaced). The literal phrase contains `done` for `/goal` matching; the task itself is at `in_review` awaiting HOTL approval, not actually at `done`. Composer's iteration is what ends, not the task's lifecycle. -5. 
(Single-task mode only) `composer reports proposed rewrite denied on `: emitted right after the user denies a substantive rewrite proposal in single-task mode (see *Proposed rewrites handling*). +1. **Backlog drained**: `ready` and `plannable` are both empty. +2. **Failure budget exhausted**: three failed attempts on the same task (single-task mode). +3. **User says stop**: exit after the in-flight write finishes. +4. **Single-task iteration complete**: verdict surfaced and propagation done. The task itself sits at `in_review` awaiting HOTL; composer's job is finished. +5. **Rewrite denied** (single-task mode): the user rejected a proposed rewrite at the gate. -Do not invent new stop phrases. The `/goal` condition the user pastes during bootstrap matches these five verbatim; any drift breaks the harness. +These five are exhaustive. Do not invent new stop conditions, and do not stop for anything else. -## Reuse points from the mymir skill +## Recovering after compaction -Composer is glue. It explicitly defers to the `mymir` skill for: +Re-derive the phase from the iteration todos plus the task's Mymir status: `draft` without a plan → research or planning pending; `planned` → implementation pending; `in_progress` → implementer in flight, a fix rotation in flight, or partial-success recovery; `in_review` → review pending, the fix loop mid-cycle, or the iteration's verdict already in the transcript (check before re-dispatching); `done` → HOTL approved, run propagation if it has not run. For runs likely to span compaction, prefer single-task mode re-invoked per task. Broader primitives: the resilience reference loaded above. -- **Task selection.** `mymir_analyze type='ready'` ∩ `type='critical_path'`, ranked by priority then estimate (see `plugins/claude-code/skills/mymir/SKILL.md` § *What should I work on?*). 
-- **Refinement.** If the researcher's brief identifies vague acceptance criteria or a thin description, the planner applies refinements via `mymir_task action='update'` with append semantics (see § *Refine a task* in the mymir SKILL.md). -- **Planning.** Phase 2 saves the unabridged `implementationPlan` and transitions `draft → planned` exactly as § *Plan a draft task* specifies. -- **Implementation.** Phase 3 follows § *Implement a task* and the Completion Protocol (lifecycle §2). PR template detection, bracket form, body structure, `gh pr create` syntax all defer there. Composer adds only a conventional-commit title prefix when the project uses that format, and a `/-` branch name; both live in `agents/composer-implementer.md`. -- **Review.** Phase 4 reads `mymir_context depth='review'` and follows the five-lens persona in `agents/review.md`. The verdict shape (`approve` / `request-changes` / `block` plus per-lens prose, AC evaluation, plan-vs-files drift, downstream impact) is the reviewer's contract with the orchestrator and with HOTL. -- **Propagation.** `mymir_query type='edges'` then `mymir_analyze type='downstream'` after every `in_review` transition (and every later `done`); update edge notes, retire stale edges. +## Red flags — never do these -If a flow exists in the mymir skill, do not reinvent it inside a subagent. Cite the section by file path and anchor instead. +| Temptation | Reality | +| --- | --- | +| Write `status` "so no other agent grabs the task" | Every transition belongs to a subagent: planner `draft→planned`; implementer `planned→in_progress→in_review` plus the fix rotation; HOTL `in_review→done`. The orchestrator writes propagation edges, nothing else. | +| Split an oversize task yourself | Oversize routes to `mymir:decompose-task`, and only after the user gate. | +| Treat `request-changes` or `block` as a failed attempt | A careful verdict is a successful review (`STATUS: DONE`). 
The fix loop or HOTL owns the response; the failure budget is untouched. | +| Re-implement when a matching PR already exists | Resume the Completion Protocol instead. | +| Pause between tasks to ask "should I continue?" | Continuous execution. The five stop conditions are the only exits; gates fire only on `NEEDS_DECISION`. | +| Keep fixing after 2 rotations, or auto-fix a `block` | Escalate to HOTL with all verdicts. | +| Pad a dispatch with transcript, meta, or spec text | Phase minimum only. Pollution makes agents worse. | +| Emit or watch for literal stop phrases | Stops are structural; report them in plain language. | ## What composer is not -- **Not a decomposer.** Oversize tasks route to `mymir:decompose-task`. Composer asks first; never silently splits a task. -- **Not a refiner.** Composer's researcher proposes refinements via the brief; the planner applies them through the canonical `mymir_task` update path. If the user wants pure refinement, they should run the `mymir` skill directly. -- **Not the code reviewer itself.** Composer dispatches the `mymir:review` subagent in Phase 4 to produce a structured verdict, but the orchestrator does not interpret the verdict beyond surfacing it. The PR is reviewed on GitHub like any other PR; the HOTL operator owns the final `in_review → done` transition outside composer's loop, regardless of whether the reviewer recommended `approve`, `request-changes`, or `block`. -- **Not a session-resilience layer.** Long runs that hit auto-compaction rely on `/goal` to bound the session and on `mymir_query type='meta'` plus the per-task Mymir status to re-acquire project state on resume; composer does not persist its own session file. The orchestrator's "current phase" is implicit, derived from transcript and task status; after compaction it reconstructs per the *Phase entry and exit conditions* table. 
For runs likely to span compaction, prefer single-task mode and re-invoke composer per task rather than running an unbounded backlog loop. See `skills/mymir/references/resilience.md` for the broader resilience primitives. +Not a decomposer (oversize routes out). Not a hand-refiner (that is the mymir skill, used directly). Not the merge gate (HOTL owns `in_review → done` and merging, whatever the verdict). Not a session-resilience layer (re-invoke per task for very long runs). ## See also -- `plugins/claude-code/skills/mymir/SKILL.md`: canonical Mymir flows composer reuses. -- `skills/mymir/references/conventions.md`: Iron Law of grounding (cite real code, real refs; never speculate). -- `skills/mymir/references/artifacts.md`: title/description/AC quality (§1), tag dimensions (§2), categories (§4), oversize threshold (§5). -- `skills/mymir/references/lifecycle.md`: status lifecycle (§1), Completion Protocol with PR template detection (§2), propagation (§3). -- `plugins/claude-code/agents/composer-researcher.md`, `composer-planner.md`, `composer-implementer.md`, `review.md`: the four subagent definitions composer dispatches. -- `plugins/claude-code/agents/decompose.md`: the oversize-delegation target. +- `plugins/claude-code/skills/mymir/SKILL.md`: canonical flows composer reuses — selection (§ *What should I work on?*), refinement (§ *Refine a task*), planning (§ *Plan a draft task*), implementation (§ *Implement a task*), propagation. +- `plugins/claude-code/agents/composer-researcher.md`, `composer-planner.md`, `composer-implementer.md`, `review.md`: the four phase contracts, including each phase's STATUS rules. +- `plugins/claude-code/skills/composer/references/`: the slim per-phase rule extracts the agents load. +- `plugins/claude-code/agents/decompose-task.md`: the oversize-delegation target. 
From a3c8017f2ccea9d97c32e2018ca60ab3d25fa7c9 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:33:45 +0200 Subject: [PATCH 11/45] docs: update mymir skill for composer structural stops --- plugins/claude-code/skills/mymir/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/claude-code/skills/mymir/SKILL.md b/plugins/claude-code/skills/mymir/SKILL.md index 1d428d90..50a5e282 100644 --- a/plugins/claude-code/skills/mymir/SKILL.md +++ b/plugins/claude-code/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. 
| +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest the user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer <taskRef>`** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts. The user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review `<PR URL>`", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). 
Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task From 4e3c51d272cac62fbfc2e36df46c1d257c965bf9 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 08:39:09 +0200 Subject: [PATCH 12/45] fix: close composer loophole found in pressure test --- plugins/claude-code/skills/composer/SKILL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index ca00503e..e7876384 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -189,6 +189,7 @@ Re-derive the phase from the iteration todos plus the task's Mymir status: `draf | Temptation | Reality | | --- | --- | | Write `status` "so no other agent grabs the task" | Every transition belongs to a subagent: planner `draft→planned`; implementer `planned→in_progress→in_review` plus the fix rotation; HOTL `in_review→done`. The orchestrator writes propagation edges, nothing else. 
| +| Skip research or planning to "get the claim in faster" | The phase order is fixed for every task, including `planned` entries (the planner re-validates): research → plan → implement → review. The implementer claims when its turn comes; no urgency moves it earlier. | | Split an oversize task yourself | Oversize routes to `mymir:decompose-task`, and only after the user gate. | | Treat `request-changes` or `block` as a failed attempt | A careful verdict is a successful review (`STATUS: DONE`). The fix loop or HOTL owns the response; the failure budget is untouched. | | Re-implement when a matching PR already exists | Resume the Completion Protocol instead. | From 9b5e173c7723d2f7949387ad6cef742c220504c4 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 10:45:34 +0200 Subject: [PATCH 13/45] fix: end composer iteration after planning plannable picks --- plugins/claude-code/agents/composer-implementer.md | 2 +- plugins/claude-code/skills/composer/SKILL.md | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index 02e55224..289dda2b 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -79,7 +79,7 @@ On failure (verification cannot reach green, plan is broken), leave the task at a. `mymir_context depth='agent' taskId=''`. Read multi-hop dependencies, upstream `executionRecord` entries, the full `implementationPlan`, and the current `acceptanceCriteria`. Read the plan in full; do not skim. -b. Confirm `status` is `planned`. If it is anything else (`in_progress` from a prior attempt is acceptable; `done` or `cancelled` means stop and report the unexpected state), surface it to the orchestrator and exit. +b. Confirm `status` is `planned`. 
`in_progress` from a prior attempt is also acceptable. If it is anything else (`done` or `cancelled` means stop and report the unexpected state), surface it to the orchestrator and exit. Additionally verify every `depends_on` dependency in the agent-depth bundle is `done`. Any dependency not at `done` means the pick was premature (a plannable pick routed too far): exit without claiming, returning `STATUS: BLOCKED — dependencies unfinished: <taskRefs>`. c. Verify the plan is implementable. Walk the plan's *Files to modify* list and confirm each path exists where the plan claims (or that the path is a new file the plan expects you to create). If a path is wrong, fail loudly: report the discrepancy, leave the task at `planned`, exit. diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index e7876384..27af8904 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -88,6 +88,7 @@ digraph composer_iteration { "STOP: iteration ends (single-task)" [shape=doublecircle]; "Dispatch planner" [shape=box]; "Planner STATUS?" [shape=diamond]; + "Pick was plannable-only?" [shape=diamond]; "Dispatch implementer" [shape=box]; "Implementer STATUS?" [shape=diamond]; "Dispatch reviewer" [shape=box]; @@ -113,7 +114,9 @@ digraph composer_iteration { "Continue this task?" -> "Pick next task" [label="no (backlog)"]; "Continue this task?" -> "STOP: iteration ends (single-task)" [label="no (single-task)"]; "Dispatch planner" -> "Planner STATUS?"; - "Planner STATUS?" -> "Dispatch implementer" [label="DONE / DONE_WITH_CONCERNS"]; + "Planner STATUS?" -> "Pick was plannable-only?" [label="DONE / DONE_WITH_CONCERNS"]; + "Pick was plannable-only?" -> "Dispatch implementer" [label="no"]; + "Pick was plannable-only?" -> "Single-task mode?" [label="yes: planned; deps unfinished"]; "Planner STATUS?" -> "Failure handling" [label="BLOCKED"]; "Dispatch implementer" -> "Implementer STATUS?"; "Implementer STATUS?" 
-> "Dispatch reviewer" [label="DONE / DONE_WITH_CONCERNS"]; @@ -136,12 +139,14 @@ digraph composer_iteration { ### Step details -1. **Pick.** Backlog: `mymir_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. Fall back to the highest-priority `ready` task when the intersection is empty, then to `mymir_analyze type='plannable'` when `ready` is empty (those route through research + plan; nothing to implement yet). Single-task: the named task; if already `done` or `cancelled`, report that and stop. Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval — the user interrupts if they disagree. +1. **Pick.** Backlog: `mymir_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. Fall back to the highest-priority `ready` task when the intersection is empty, then to `mymir_analyze type='plannable'` when `ready` is empty (those route through research + plan only; their dependencies are unfinished, so there is nothing to implement yet — note the pick as **plannable-only**). Single-task: the named task; if already `done` or `cancelled`, report that and stop. Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval — the user interrupts if they disagree. 2. **Research.** Dispatch `mymir:composer-researcher` with: `Target task: `, the categories + tag vocabulary from bootstrap, and (on re-dispatch) the user's gate answers. Status does not change in this phase; the researcher refines the task row in place. React per *Status vocabulary*. 3. **Plan.** Dispatch `mymir:composer-planner` with: `Target task: `, the task's current status (so it knows new-plan vs re-validate), and the research brief verbatim. 
Verify with one `mymir_context depth='summary' taskId=''` poll: a `draft` entry must now show a plan and `status='planned'`. If not, re-dispatch once with the failure appended; a second miss is `BLOCKED`. + When the pick was plannable-only, the iteration ends here: the task is now `planned` and its dependencies are still unfinished, so there is nothing to implement. Backlog mode returns to the pick; single-task mode reports the planned outcome and stops. Never dispatch the implementer on a plannable-only pick. + 4. **Implement.** Dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. 5. **Review and the fix loop.** Dispatch `mymir:review` with: `Target task: . PR URL: . Mode: composer-phase-4. Fetch the bundle via mymir_context depth='review'.` On `STATUS: DONE`, branch on the verdict payload: From 7f66d4e622140da738086be91ac84ca6efc434fe Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 10:45:58 +0200 Subject: [PATCH 14/45] fix: add transport stop and claimed-task entry rules to composer --- plugins/claude-code/skills/composer/SKILL.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 27af8904..a0d371f1 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -139,7 +139,7 @@ digraph composer_iteration { ### Step details -1. **Pick.** Backlog: `mymir_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. 
Fall back to the highest-priority `ready` task when the intersection is empty, then to `mymir_analyze type='plannable'` when `ready` is empty (those route through research + plan only; their dependencies are unfinished, so there is nothing to implement yet — note the pick as **plannable-only**). Single-task: the named task; if already `done` or `cancelled`, report that and stop. Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval — the user interrupts if they disagree. +1. **Pick.** Backlog: `mymir_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. Fall back to the highest-priority `ready` task when the intersection is empty, then to `mymir_analyze type='plannable'` when `ready` is empty (those route through research + plan only; their dependencies are unfinished, so there is nothing to implement yet — note the pick as **plannable-only**). Single-task: the named task; if already `done` or `cancelled`, report that and stop. If the named task is already claimed, never re-run research or planning on it: at `in_progress`, jump straight to implement-phase recovery (the partial-success check in *Failure handling*); at `in_review`, jump straight to *Review and the fix loop*. Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval — the user interrupts if they disagree. 2. **Research.** Dispatch `mymir:composer-researcher` with: `Target task: `, the categories + tag vocabulary from bootstrap, and (on re-dispatch) the user's gate answers. Status does not change in this phase; the researcher refines the task row in place. React per *Status vocabulary*. @@ -165,7 +165,7 @@ Subagents inherit nothing from this session; the dispatch prompt is their whole ## Failure handling -`BLOCKED` from any phase is a failed attempt. 
On failure: +`BLOCKED` from any phase is a failed attempt, with one exception: a phase that reports BLOCKED because the task is already at `done` or `cancelled` is not a failure — HOTL resolved the task underneath the run (e.g. approving mid-fix-rotation). Treat that as iteration complete: run *Surface + propagate* if it has not run, consume no failure budget, and move on. For every other BLOCKED: 1. Keep the failure summary in your transcript. Do not write it to `decisions` — per artifacts §1 that field is CHOICE + WHY, not process metadata. 2. Leave the task at its current status. Never roll back, never cancel. @@ -182,8 +182,9 @@ Stop and report in plain language (there are no magic stop phrases) when one of 3. **User says stop**: exit after the in-flight write finishes. 4. **Single-task iteration complete**: verdict surfaced and propagation done. The task itself sits at `in_review` awaiting HOTL; composer's job is finished. 5. **Rewrite denied** (single-task mode): the user rejected a proposed rewrite at the gate. +6. **Mymir transport/auth failure**: any Mymir tool call fails with auth expiry, 401/403, a 5xx, or a network error. Stop immediately — these are not retryable in-session (resilience §10) — and report the exact error text plus the last completed phase for each in-flight task. -These five are exhaustive. Do not invent new stop conditions, and do not stop for anything else. +These six are exhaustive. Do not invent new stop conditions, and do not stop for anything else. ## Recovering after compaction @@ -198,7 +199,7 @@ Re-derive the phase from the iteration todos plus the task's Mymir status: `draf | Split an oversize task yourself | Oversize routes to `mymir:decompose-task`, and only after the user gate. | | Treat `request-changes` or `block` as a failed attempt | A careful verdict is a successful review (`STATUS: DONE`). The fix loop or HOTL owns the response; the failure budget is untouched. 
| | Re-implement when a matching PR already exists | Resume the Completion Protocol instead. | -| Pause between tasks to ask "should I continue?" | Continuous execution. The five stop conditions are the only exits; gates fire only on `NEEDS_DECISION`. | +| Pause between tasks to ask "should I continue?" | Continuous execution. The six stop conditions are the only exits; gates fire only on `NEEDS_DECISION`. | | Keep fixing after 2 rotations, or auto-fix a `block` | Escalate to HOTL with all verdicts. | | Pad a dispatch with transcript, meta, or spec text | Phase minimum only. Pollution makes agents worse. | | Emit or watch for literal stop phrases | Stops are structural; report them in plain language. | From 99333e6332060e6ff50e536f1e67f54246ca565d Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 10:58:34 +0200 Subject: [PATCH 15/45] fix: add recovery, retry, headless, and propagation rules --- plugins/claude-code/skills/composer/SKILL.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index a0d371f1..f45375c2 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -63,12 +63,15 @@ Expected `NEEDS_DECISION` triggers (all from the researcher): A return without a STATUS line is malformed: re-read the prose once; if the outcome is still ambiguous, treat it as `BLOCKED`. +**Headless gate fallback:** when `AskUserQuestion` is unavailable (errors or hangs — headless runs, policy-denied contexts), a `NEEDS_DECISION` gate resolves to skip-the-task: record the unasked question and the skip in the stop report, then end the iteration (backlog mode picks the next task; single-task mode stops). Never fabricate an answer — skipping is the reversible default (resilience §11). + ## Session bootstrap Once per session, before the first iteration: 1. 
**Resolve the project.** `mymir_project action='list'` → `action='select' projectId='...'`. Single-task mode: also `mymir_query type='search' query=''` to resolve the task UUID and current status. 2. **Read meta.** `mymir_query type='meta'`. Keep the categories and tag vocabulary for researcher dispatches; drop the status counts. +3. **Stale-claim sweep.** Scan the project's task list (`mymir_query type='list'`) for tasks already at `in_progress`. These are possible stale claims from dead sessions; surface them in the first pick rationale so the user sees them before the run commits elsewhere. Then start iterating. There is nothing to install and nothing to confirm. @@ -155,7 +158,7 @@ digraph composer_iteration { - **`request-changes`** with 2 rotations used, or **`block`**: stop fixing. Escalate every verdict from this task to HOTL and go to step 6. `block` is never auto-fixed; review.md calibrates it as "one rotation will not land this". - The verdict is advisory beyond the fix loop: HOTL owns `in_review → done` on GitHub regardless of verdict. -6. **Surface + propagate.** Quote the final verdict block verbatim. Then propagate per lifecycle §3: `mymir_query type='edges' taskId=''`, `mymir_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3 — one to three short sentences addressed to the downstream task's agent). Surface newly-unblocked tasks in the next pick rationale. +6. **Surface + propagate.** Quote the final verdict block verbatim. Then propagate per lifecycle §3: `mymir_query type='edges' taskId=''`, `mymir_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3 — one to three short sentences addressed to the downstream task's agent). Propagation depth follows the verdict: on `approve`, propagate fully. 
On an escalated `request-changes` or `block`, write edge-note updates as provisional — prefix each with `Provisional pending HOTL on PR #:` — because HOTL may reject the work; the HOTL `done` flip (outside composer, as today) is the trigger for firming them up. Surface newly-unblocked tasks in the next pick rationale. 7. **Loop.** Single-task: report the iteration outcome and stop. Backlog: next iteration, no pause. @@ -169,15 +172,24 @@ Subagents inherit nothing from this session; the dispatch prompt is their whole 1. Keep the failure summary in your transcript. Do not write it to `decisions` — per artifacts §1 that field is CHOICE + WHY, not process metadata. 2. Leave the task at its current status. Never roll back, never cancel. -3. Backlog mode: move to the next pick; the stuck task stays where it is for human triage. Single-task mode: retry the failed phase up to three total attempts on the task, appending each failure summary to the re-dispatch; after the third, report and stop. Re-run research or planning only when the failure clearly traces to a planning gap (e.g. the plan names a file that does not exist). +3. Backlog mode: when the failure summary is transient-shaped (network hiccup, flaky test, dirty workspace state), retry the failed phase once with the failure summary appended; otherwise, or when the retry also fails, move to the next pick; the stuck task stays where it is for human triage. Single-task mode: retry the failed phase up to three total attempts on the task, appending each failure summary to the re-dispatch; after the third, report and stop. Re-run research or planning only when the failure clearly traces to a planning gap (e.g. the plan names a file that does not exist). + +**Partial success (PR exists, `in_review` not marked):** when a retry's pre-flight finds the task at `in_progress` with an open PR matching `/-`, do not re-implement. 
First verify the PR actually belongs to the task: its title or body must carry the `[]` bracket form — a branch-name match alone is not proof. Verified: dispatch the implementer to resume the Completion Protocol against the existing PR (re-evaluate ACs, populate the payload, mark `in_review`). Counts as one attempt. + +**`in_review` without a PR link:** when the task sits at `in_review` but `task.links` carries no `pull_request` entry, look for the orphaned PR: + +```bash +gh pr list --state open --json url,title,body,headRefName \ + --jq '.[] | select(.headRefName | contains(""))' +``` -**Partial success (PR exists, `in_review` not marked):** when a retry's pre-flight finds the task at `in_progress` with an open PR matching `/-`, do not re-implement. Dispatch the implementer to resume the Completion Protocol against the existing PR (re-evaluate ACs, populate the payload, mark `in_review`). Counts as one attempt. +If a hit carries the `[]` bracket form in title or body, dispatch the implementer to re-run the Completion Protocol payload against it (the `prUrl` write repairs the link). No verified match: report the inconsistency to the user; never fabricate a link. ## Stop conditions Stop and report in plain language (there are no magic stop phrases) when one of these holds: -1. **Backlog drained**: `ready` and `plannable` are both empty. +1. **Backlog drained**: `ready` and `plannable` are both empty. The stop report enumerates every task left at `in_progress`/`in_review` with its failure summary — the stranded-task report; nothing strands silently. 2. **Failure budget exhausted**: three failed attempts on the same task (single-task mode). 3. **User says stop**: exit after the in-flight write finishes. 4. **Single-task iteration complete**: verdict surfaced and propagation done. The task itself sits at `in_review` awaiting HOTL; composer's job is finished. 
From 0f946cc49d2d788eaf94b8fc95a92e1240bc52c1 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:03:38 +0200 Subject: [PATCH 16/45] fix: harden implementer against env failures and foreign edits --- .../claude-code/agents/composer-implementer.md | 15 +++++++++++---- plugins/claude-code/skills/composer/SKILL.md | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index 289dda2b..4871b150 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -145,6 +145,8 @@ c. **PR body, template detection, taskRef bracket form, `gh pr create` syntax.** #### Success path +Immediately before this write, re-read the task: `mymir_context depth='summary' taskId=''`. If status is no longer `in_progress` (a human cancelled or edited the task underneath you), do not write. Report the observed status and exit with `STATUS: BLOCKED — status changed underneath: `. This rule applies to every `in_review` write, including fix-mode step 7. + One `mymir_task action='update'` call carrying the full Completion Protocol payload, append-only. Field shape, content rules, and AC evaluation semantics: lifecycle §2. Pass `prUrl` whenever a PR was opened (the dominant case); the backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR. ``` @@ -186,10 +188,15 @@ When the dispatch says fix mode, the reviewer requested changes on your PR and t 1. `mymir_context depth='agent' taskId=''`. Confirm status is `in_review` and the PR matches the dispatch URL. Anything else: report the mismatch and exit with `STATUS: BLOCKED`. 2. `mymir_task action='update' taskId='' status='in_progress'`. This is the fix-rotation claim. 3. Check out the existing branch (`gh pr view --json headRefName`); never create a new branch or PR. -4. 
Address **exactly the blocking findings in the dispatch**. No replanning, no scope expansion, no drive-by refactors. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. -5. Re-run the full verification suite (typecheck, lint, tests) until green, push to the same branch. -6. Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched). -7. Return: ` fix rotation complete. PR . .` plus the STATUS line per the success/failure paths above. +4. Inspect the branch for foreign commits: compare the PR's commit authors (`gh pr view --json commits --jq '.commits[].authors[].login'`) against your own identity (`git config user.name` and the login you push as). Foreign commits found: note them verbatim in your return message and re-evaluate ALL acceptance criteria in step 7, not only the ACs the findings touched — someone else's edits may have moved ground under criteria you previously satisfied. +5. Address **exactly the blocking findings in the dispatch**. No replanning, no scope expansion, no drive-by refactors. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. +6. Re-run the full verification suite (typecheck, lint, tests) until green, push to the same branch. +7. Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched, or all ACs when step 4 found foreign commits). +8. Return: ` fix rotation complete. PR . .` plus the STATUS line per the success/failure paths above. + +## Environmental failures + +When a `gh` call fails for environmental reasons — auth expiry (`gh auth status` failing, 401s), rate limiting, network errors — the work is not at fault. 
One immediate retry is fine; if it persists, stop and return `STATUS: BLOCKED — environmental: `. The orchestrator surfaces environmental failures to the user without consuming the failure budget; mislabeling a real verification failure as environmental hides broken work, so use this only for errors the environment alone can fix. ## What this phase does not do diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index f45375c2..40c031dd 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -168,7 +168,7 @@ Subagents inherit nothing from this session; the dispatch prompt is their whole ## Failure handling -`BLOCKED` from any phase is a failed attempt, with one exception: a phase that reports BLOCKED because the task is already at `done` or `cancelled` is not a failure — HOTL resolved the task underneath the run (e.g. approving mid-fix-rotation). Treat that as iteration complete: run *Surface + propagate* if it has not run, consume no failure budget, and move on. For every other BLOCKED: +`BLOCKED` from any phase is a failed attempt, with two exceptions. First: a phase that reports BLOCKED because the task is already at `done` or `cancelled` is not a failure — HOTL resolved the task underneath the run (e.g. approving mid-fix-rotation). Treat that as iteration complete: run *Surface + propagate* if it has not run, consume no failure budget, and move on. Second: `STATUS: BLOCKED — environmental: ` (gh auth expiry, rate limits, network) is an environment problem, not a work problem — surface it to the user verbatim and consume no failure budget; resume the same phase once the user confirms the environment is fixed. For every other BLOCKED: 1. Keep the failure summary in your transcript. Do not write it to `decisions` — per artifacts §1 that field is CHOICE + WHY, not process metadata. 2. Leave the task at its current status. Never roll back, never cancel. 
From 345f8a7e2aeda585fc669c0b484ae17f62ed0362 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:04:13 +0200 Subject: [PATCH 17/45] fix: reviewer env failures and working-depth doc drift --- plugins/claude-code/agents/review.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/claude-code/agents/review.md b/plugins/claude-code/agents/review.md index 4ccf74b1..2868f397 100644 --- a/plugins/claude-code/agents/review.md +++ b/plugins/claude-code/agents/review.md @@ -61,7 +61,7 @@ If the task is not at `in_review` (still `in_progress`, or already `done` / `can - `Read`, `Glob`, `Grep`: codebase reads. Walk the files the implementer touched. Compare against the plan. - `Bash`: read-only. `gh pr view `, `gh pr diff `, `gh pr checks `, `git log`, `git show`, `git diff`. No mutating `gh` (`pr edit`, `pr review --approve`, `pr merge`), no `git push`, no edits to the working tree. -- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. **Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. +- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
**Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. - `mymir_query` (`search`, `edges`, `meta`, `list`): graph and project awareness. - `mymir_analyze` (`downstream`, `blocked`, `critical_path`): impact reasoning for the downstream lens. - `context7` (`resolve-library-id`, `query-docs`), `WebFetch`, `WebSearch`: outward research when an API call in the diff looks wrong against the library's current contract. Prefer `context7` for library docs; reach for `WebFetch` only when context7 misses. @@ -86,7 +86,7 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with ### 1. Pre-flight -a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. +a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. @@ -284,7 +284,7 @@ End your return with a final line: `STATUS: ` - `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. -- `BLOCKED`: you could not review at all — `mymir_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. +- `BLOCKED`: you could not review at all — `mymir_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: `; the orchestrator surfaces these to the user without consuming the failure budget. 
## What this agent does not do From 72a8849bcc2e3af60d2a5388fb04560226fd2739 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:08:24 +0200 Subject: [PATCH 18/45] feat: isolate composer implementer in a git worktree --- plugins/claude-code/agents/composer-implementer.md | 1 + plugins/claude-code/skills/composer/SKILL.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index 4871b150..e380bbc2 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -16,6 +16,7 @@ description: > user asks "implement per the saved plan" outside the composer loop. model: opus +isolation: worktree --- # Composer implementer (Phase 3) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 40c031dd..a293ee8a 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -150,7 +150,7 @@ digraph composer_iteration { When the pick was plannable-only, the iteration ends here: the task is now `planned` and its dependencies are still unfinished, so there is nothing to implement. Backlog mode returns to the pick; single-task mode reports the planned outcome and stops. Never dispatch the implementer on a plannable-only pick. -4. **Implement.** Dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. +4. **Implement.** Dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. 
Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. The implementer runs worktree-isolated (frontmatter `isolation: worktree`; also pass the Task tool's `isolation: "worktree"` parameter at dispatch, which is verified to work with plugin agents): it works in its own tree, the orchestrator's tree never moves, and the researcher's baseline stays stable. 5. **Review and the fix loop.** Dispatch `mymir:review` with: `Target task: . PR URL: . Mode: composer-phase-4. Fetch the bundle via mymir_context depth='review'.` On `STATUS: DONE`, branch on the verdict payload: - **`approve`**: go to step 6. From e839681fc5abe8bd3cdea6b7749c819e2562cd14 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:14:32 +0200 Subject: [PATCH 19/45] feat: derive default branch and handle branch collisions --- .../claude-code/agents/composer-implementer.md | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index e380bbc2..71cc10a3 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -86,6 +86,8 @@ c. Verify the plan is implementable. Walk the plan's *Files to modify* list and d. Confirm the project's test, typecheck, and lint commands from the plan's *Verification* section. If the plan is missing one, read `package.json` / `pyproject.toml` / `Cargo.toml` to derive it; if you cannot derive it, fail loudly and exit. Do not invent commands. +e. When you are running directly in the orchestrator's tree (no worktree isolation), require a clean tree: `git status --porcelain` must print nothing. Anything else: fail loudly naming the leftover state (`STATUS: BLOCKED — dirty tree: `). Inside an isolated worktree this is guaranteed fresh; skip the check. 
+ ### 2. Claim and branch a. `mymir_task action='update' taskId='' status='in_progress'`. This is your claim; it tells anyone else looking at the project the task is being worked. @@ -104,10 +106,19 @@ b. Create a feature branch from the project's default branch. - Task `[MYM-83] Extract validation helper`, tag `refactor` → `refactor/mym-83-extract-validation-helper` ```bash - git checkout main && git pull --ff-only - git checkout -b + DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef -q '.defaultBranchRef.name') + # Fallback when gh is unavailable: + # DEFAULT_BRANCH=$(git remote show origin | sed -n 's/.*HEAD branch: //p') + git checkout "$DEFAULT_BRANCH" && git pull --ff-only + git fetch origin "+refs/heads/:refs/remotes/origin/" 2>/dev/null || true ``` + Never hardcode `main`; projects differ. + + **If the task branch already exists** (locally or on `origin`): do not create a new one. Verify it is yours first: `git log "origin/$DEFAULT_BRANCH".. --format='%s'` plus `gh pr list --head --json title,body` — the commits or the PR must reference this taskRef (the `[]` bracket form, or the taskRef in commit subjects). Yours: check it out and continue from where the prior attempt stopped (retries reuse the branch). Foreign (a different task or author squatting the deterministic name): fail loudly naming the conflict — `STATUS: BLOCKED — branch collision: carries `. Suffixes stay forbidden; never mint `-2`. + + **Otherwise**: `git checkout -b `. + **Never** append an `attempt-N` suffix and **never** nest the taskRef as its own path segment (`composer/RZE-17/attempt-1` is wrong; this is an old pattern that no longer applies). Retries reuse the same branch and append commits; git history tracks attempts, the branch name does not. One branch per task; do not stack tasks on one branch unless the user has explicitly arranged it. ### 3. 
Implement From 1e676969b0df8d0de54a3f73f367c080ae077551 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:19:15 +0200 Subject: [PATCH 20/45] feat: merge default branch forward before pr and fix rotations --- plugins/claude-code/agents/composer-implementer.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index 71cc10a3..d077aad3 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -143,12 +143,16 @@ Run, in order: ``, ``, ``. All th ### 5. Open a PR -a. Push the branch: +a. Merge the default branch forward, then push: ```bash + git fetch origin "$DEFAULT_BRANCH" + git merge "origin/$DEFAULT_BRANCH" git push -u origin ``` + Conflict resolution is in-scope work, not a failure: resolve, re-run verification (step 4), then push. A nontrivial resolution (anything beyond keeping both sides' independent hunks) gets a `decisions` entry (CHOICE + WHY). Never rebase a pushed branch; force-push stays forbidden. + b. **PR title: composer's one addition over lifecycle §2.3.** Lifecycle §2.3 specifies `` (verbatim, no paraphrase) as the title and places the `[]` bracket form in the body's linked-task / Task Reference section, not the title. Composer adds exactly one refinement: when the research brief's *Project conventions* identifies a conventional-commits format for the project, prefix the title with the work-type alias from step 2b. Examples: `feat: `, `fix: `, `refactor: `. When the project uses plain titles, drop the prefix and follow lifecycle §2.3 unchanged. The researcher's brief names the format; do not guess. c. **PR body, template detection, taskRef bracket form, `gh pr create` syntax.** Defer entirely to lifecycle §2.3. 
Your source fields (`executionRecord`, `decisions`, `files`, `acceptanceCriteria`) are already populated on your side; map them onto the template's sections (or the §2.3 no-template default) as lifecycle specifies. Capture the returned PR URL for step 6. @@ -199,7 +203,7 @@ When the dispatch says fix mode, the reviewer requested changes on your PR and t 1. `mymir_context depth='agent' taskId=''`. Confirm status is `in_review` and the PR matches the dispatch URL. Anything else: report the mismatch and exit with `STATUS: BLOCKED`. 2. `mymir_task action='update' taskId='' status='in_progress'`. This is the fix-rotation claim. -3. Check out the existing branch (`gh pr view --json headRefName`); never create a new branch or PR. +3. Check out the existing branch (`gh pr view --json headRefName`), `git pull --ff-only`, then merge the default branch forward (same policy as step 5a: conflicts are in-scope work, nontrivial resolutions recorded in `decisions`, never rebase a pushed branch). Never create a new branch or PR. 4. Inspect the branch for foreign commits: compare the PR's commit authors (`gh pr view --json commits --jq '.commits[].authors[].login'`) against your own identity (`git config user.name` and the login you push as). Foreign commits found: note them verbatim in your return message and re-evaluate ALL acceptance criteria in step 7, not only the ACs the findings touched — someone else's edits may have moved ground under criteria you previously satisfied. 5. Address **exactly the blocking findings in the dispatch**. No replanning, no scope expansion, no drive-by refactors. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. 6. Re-run the full verification suite (typecheck, lint, tests) until green, push to the same branch. 
From f65bebd2061c8472d2f6d4a2592bf82893b36421 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:20:58 +0200 Subject: [PATCH 21/45] feat: add claim ownership semantics to composer implementer --- plugins/claude-code/agents/composer-implementer.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index d077aad3..b74d7d32 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -80,7 +80,7 @@ On failure (verification cannot reach green, plan is broken), leave the task at a. `mymir_context depth='agent' taskId=''`. Read multi-hop dependencies, upstream `executionRecord` entries, the full `implementationPlan`, and the current `acceptanceCriteria`. Read the plan in full; do not skim. -b. Confirm `status` is `planned`. If it is anything else (`in_progress` from a prior attempt is acceptable; `done` or `cancelled` means stop and report the unexpected state), surface it to the orchestrator and exit. Additionally verify every `depends_on` dependency in the agent-depth bundle is `done`. Any dependency not at `done` means the pick was premature (a plannable pick routed too far): exit without claiming, returning `STATUS: BLOCKED — dependencies unfinished: `. +b. Confirm `status` is `planned`. If it is anything else (`in_progress` from a prior attempt is acceptable; `done` or `cancelled` means stop and report the unexpected state), surface it to the orchestrator and exit. Additionally verify every `depends_on` dependency in the agent-depth bundle is `done`. Any dependency not at `done` means the pick was premature (a plannable pick routed too far): exit without claiming, returning `STATUS: BLOCKED — dependencies unfinished: `. 
Claim semantics for `in_progress` entries: a foreign assignee (the bundle's `assignees` is non-empty and does not include you) means someone else's claim — exit with `STATUS: BLOCKED — claimed by ` and touch nothing. No assignee at all is acceptable **only** with prior-attempt evidence: the deterministic task branch exists or an open PR carries the `[]` bracket; without evidence, exit `STATUS: BLOCKED — unowned in_progress claim, no prior-attempt evidence`. c. Verify the plan is implementable. Walk the plan's *Files to modify* list and confirm each path exists where the plan claims (or that the path is a new file the plan expects you to create). If a path is wrong, fail loudly: report the discrepancy, leave the task at `planned`, exit. @@ -90,7 +90,7 @@ e. When you are running directly in the orchestrator's tree (no worktree isolati ### 2. Claim and branch -a. `mymir_task action='update' taskId='' status='in_progress'`. This is your claim; it tells anyone else looking at the project the task is being worked. +a. `mymir_task action='update' taskId='' status='in_progress'`. This is your claim; it tells anyone else looking at the project the task is being worked. When your dispatch carries a `Caller user id: ` line (a future server release may expose it), include `assigneeIds=['']` in this claim write so the claim names its owner. Today no MCP surface returns the caller's own user id (`mymir_project action='teams'` lists team ids only), so claims rest on branch evidence: the deterministic branch name plus the `[]` bracket are the ownership proof. Say so in your return when you claim without an assignee, so the orchestrator can note it in the run log. b. Create a feature branch from the project's default branch. 
From 2b997fe540258b049c81dfc952d72b30975d0134 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:28:08 +0200 Subject: [PATCH 22/45] feat: gate composer review dispatch on pr checks --- plugins/claude-code/agents/review.md | 8 +++---- plugins/claude-code/skills/composer/SKILL.md | 25 +++++++++++++------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/plugins/claude-code/agents/review.md b/plugins/claude-code/agents/review.md index 2868f397..94e68e7e 100644 --- a/plugins/claude-code/agents/review.md +++ b/plugins/claude-code/agents/review.md @@ -90,7 +90,7 @@ a. `mymir_context depth='working' taskId=''`. Returns description, acceptanc b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. 
Cross-check the PR file list against the task's `files`. A path in the task `files` array that does not appear in the diff (or vice versa) is plan-vs-files drift; flag it under the relevant lens. @@ -169,7 +169,7 @@ The plan named the files the implementer was going to touch. The `files` array n ### 7. Downstream impact -`mymir_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`mymir_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -291,7 +291,7 @@ End your return with a final line: - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `mymir_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. 
If the description or ACs are weak, surface that as a process note in the verdict and route the user to `mymir:manage` or the mymir skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -325,7 +325,7 @@ End your return with a final line: - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `mymir_task`, `mymir_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index a293ee8a..343ea6ff 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -77,7 +77,7 @@ Then start iterating. There is nothing to install and nothing to confirm. ## The loop -At the start of each iteration, materialize these steps as todos and mark them off as you go (the todo list is your compaction anchor): pick, research, plan, implement, review, surface verdict, propagate. 
+At the start of each iteration, materialize these steps as todos and mark them off as you go (the todo list is your compaction anchor): pick, research, plan, implement, ci gate, review, surface verdict, propagate. ```dot digraph composer_iteration { @@ -99,6 +99,7 @@ digraph composer_iteration { "Verdict?" [shape=diamond]; "Fix rotations used < 2?" [shape=diamond]; "Dispatch implementer in fix mode" [shape=box]; + "CI gate: gh pr checks (10m bound)" [shape=box]; "Escalate all verdicts to HOTL" [shape=box]; "Surface verdict + propagate" [shape=box]; "Failure handling" [shape=box]; @@ -122,7 +123,8 @@ digraph composer_iteration { "Pick was plannable-only?" -> "Single-task mode?" [label="yes: planned; deps unfinished"]; "Planner STATUS?" -> "Failure handling" [label="BLOCKED"]; "Dispatch implementer" -> "Implementer STATUS?"; - "Implementer STATUS?" -> "Dispatch reviewer" [label="DONE / DONE_WITH_CONCERNS"]; + "Implementer STATUS?" -> "CI gate: gh pr checks (10m bound)" [label="DONE / DONE_WITH_CONCERNS"]; + "CI gate: gh pr checks (10m bound)" -> "Dispatch reviewer" [label="green / red / unresolved: annotate dispatch"]; "Implementer STATUS?" -> "Failure handling" [label="BLOCKED"]; "Dispatch reviewer" -> "Reviewer STATUS?"; "Reviewer STATUS?" -> "Verdict?" [label="DONE"]; @@ -152,15 +154,22 @@ digraph composer_iteration { 4. **Implement.** Dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. The implementer runs worktree-isolated (frontmatter `isolation: worktree`; also pass the Task tool's `isolation: "worktree"` parameter at dispatch, which is verified to work with plugin agents): it works in its own tree, the orchestrator's tree never moves, and the researcher's baseline stays stable. -5. 
**Review and the fix loop.** Dispatch `mymir:review` with: `Target task: . PR URL: . Mode: composer-phase-4. Fetch the bundle via mymir_context depth='review'.` On `STATUS: DONE`, branch on the verdict payload: - - **`approve`**: go to step 6. - - **`request-changes`**, fewer than 2 fix rotations used this task: dispatch the implementer in fix mode — `Target task: . Fix mode. PR: . Address exactly these review findings, re-run verification, re-mark in_review:` followed by the verdict's blocking findings verbatim. On the implementer's `DONE`, re-dispatch the reviewer (same dispatch shape). Each fix dispatch + re-review is one rotation. - - **`request-changes`** with 2 rotations used, or **`block`**: stop fixing. Escalate every verdict from this task to HOTL and go to step 6. `block` is never auto-fixed; review.md calibrates it as "one rotation will not land this". +5. **CI gate.** After the implementer returns DONE with a PR URL, watch the checks with a bounded timeout: `timeout 600 gh pr checks <PR URL> --watch`. Skip the gate entirely when the repo has no checks configured (`gh pr checks` reports no checks — that is a skip, not a red). Tell the outcomes apart by exit status: 0 is green; 124 comes from `timeout` and means checks were still pending at the bound; any other non-zero status is red, unless the output says no checks were reported (the skip case above). Branch on the result: + - **Green**: dispatch the reviewer normally. + - **Red**: dispatch the reviewer with the failing check names appended to the dispatch (`CI: failing — <check names>`); the reviewer may not approve red CI. + - **Still pending at the 10-minute timeout**: dispatch the reviewer with `CI: unresolved after 10m`; `approve` is off the table, and an otherwise-clean review returns `request-changes` citing unresolved CI as the sole blocking finding. + + The gate re-runs after every fix rotation's implementer DONE. + +6. **Review and the fix loop.** Dispatch `mymir:review` with: `Target task: <task>. PR URL: <PR URL>. Mode: composer-phase-4. Fetch the bundle via mymir_context depth='review'.` On `STATUS: DONE`, branch on the verdict payload: + - **`approve`**: go to step 7. 
+ - **`request-changes`**, fewer than 2 fix rotations used this task: dispatch the implementer in fix mode — `Target task: . Fix mode. PR: . Address exactly these review findings, re-run verification, re-mark in_review:` followed by the verdict's blocking findings verbatim. On the implementer's `DONE`, re-run the CI gate (step 5), then re-dispatch the reviewer (same dispatch shape). Each fix dispatch + re-review is one rotation. + - **`request-changes`** with 2 rotations used, or **`block`**: stop fixing. Escalate every verdict from this task to HOTL and go to step 7. `block` is never auto-fixed; review.md calibrates it as "one rotation will not land this". - The verdict is advisory beyond the fix loop: HOTL owns `in_review → done` on GitHub regardless of verdict. -6. **Surface + propagate.** Quote the final verdict block verbatim. Then propagate per lifecycle §3: `mymir_query type='edges' taskId=''`, `mymir_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3 — one to three short sentences addressed to the downstream task's agent). Propagation depth follows the verdict: on `approve`, propagate fully. On an escalated `request-changes` or `block`, write edge-note updates as provisional — prefix each with `Provisional pending HOTL on PR #:` — because HOTL may reject the work; the HOTL `done` flip (outside composer, as today) is the trigger for firming them up. Surface newly-unblocked tasks in the next pick rationale. +7. **Surface + propagate.** Quote the final verdict block verbatim. Then propagate per lifecycle §3: `mymir_query type='edges' taskId=''`, `mymir_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3 — one to three short sentences addressed to the downstream task's agent). Propagation depth follows the verdict: on `approve`, propagate fully. 
On an escalated `request-changes` or `block`, write edge-note updates as provisional — prefix each with `Provisional pending HOTL on PR #:` — because HOTL may reject the work; the HOTL `done` flip (outside composer, as today) is the trigger for firming them up. Surface newly-unblocked tasks in the next pick rationale. -7. **Loop.** Single-task: report the iteration outcome and stop. Backlog: next iteration, no pause. +8. **Loop.** Single-task: report the iteration outcome and stop. Backlog: next iteration, no pause. ## Dispatch hygiene From 22dc0837b701cd9d364f127eec4b164c8268f05a Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:33:43 +0200 Subject: [PATCH 23/45] feat: add crash-safe run log to composer bootstrap --- plugins/claude-code/skills/composer/SKILL.md | 42 ++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 343ea6ff..a4aa4d6b 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -72,6 +72,7 @@ Once per session, before the first iteration: 1. **Resolve the project.** `mymir_project action='list'` → `action='select' projectId='...'`. Single-task mode: also `mymir_query type='search' query=''` to resolve the task UUID and current status. 2. **Read meta.** `mymir_query type='meta'`. Keep the categories and tag vocabulary for researcher dispatches; drop the status counts. 3. **Stale-claim sweep.** Scan the project's task list (`mymir_query type='list'`) for tasks already at `in_progress`. These are possible stale claims from dead sessions; surface them in the first pick rationale so the user sees them before the run commits elsewhere. +4. **Init the run log.** `mkdir -p .mymir` and guard the gitignore (`grep -qxF '.mymir/' .gitignore 2>/dev/null || echo '.mymir/' >> .gitignore` — the resilience §3 pattern). 
If `.mymir/composer-<project>.md` already exists and ends with a `RUN_END` line, archive it to `.mymir/archive/composer-<project>-<timestamp>.md` and start fresh; if it exists *without* a `RUN_END`, that is a resume signal — see *Recovering after compaction* before doing anything else. Then append `RUN_START`. Then start iterating. There is nothing to install and nothing to confirm. @@ -171,6 +172,47 @@ digraph composer_iteration { 8. **Loop.** Single-task: report the iteration outcome and stop. Backlog: next iteration, no pause. +## Run log + +The run log is composer's crash-safe memory: a pure append-only event log at `.mymir/composer-<project>.md`, one active file per project. The conversation can compact; the log does not. Counters are never tracked as state — they derive by grep over events: rotations used on task X = count of `FIX task=X` lines; failed attempts = count of `FAIL task=X` lines. + +One timestamped line per event, `key=value` pairs; multi-line payloads (blocking findings verbatim, gate questions and answers, failure summaries, DONE_WITH_CONCERNS text) follow as `> ` continuation lines. 
The event vocabulary: + +| Event | Written when | +| --- | --- | +| `RUN_START` | bootstrap completes (`mode=backlog\|single\|rework project=<project>`) | +| `PICK` | step 1 emits the pick rationale | +| `PHASE` | a phase subagent returns (`phase=research\|plan\|implement status=<STATUS>`) | +| `GATE` | a `NEEDS_DECISION` gate resolves — user answer or headless skip; question and answer as continuations | +| `VERDICT` | the reviewer returns (`verdict=<verdict> rotation=<n>/2`; blocking findings as continuations) | +| `FIX` | **before** dispatching a fix rotation (`rotation=<n>/2 pr=<url>`) | +| `ESCALATE` | rotations exhausted or a `block` verdict goes to HOTL | +| `SURFACED` | the final verdict is quoted to the user | +| `PROPAGATED` | step 7 propagation completes (`edges=<n> unblocked=<refs>`) | +| `FAIL` | a phase returns BLOCKED (failure summary as continuation) | +| `TASK_END` | the iteration ends (`outcome=in_review\|planned\|stuck\|skipped rotations=<n>`) | +| `RESUME` | recovery appends this after reading the log | +| `RUN_END` | any stop condition (`reason=<...> picked=<n> shipped=<n> stuck=<n> skipped=<n>`) | + +The `FIX` line is written *before* the rotation dispatch — increment-before-dispatch is crash-safe: a crash mid-rotation wastes at most one rotation and never exceeds the budget. 
Format example: + +``` +2026-06-12T14:01:09Z RUN_START mode=backlog project=RZE +2026-06-12T14:01:31Z PICK task=RZE-42 prio=core est=5 critical=yes — auth middleware; unblocks RZE-44,RZE-45 +2026-06-12T14:05:44Z PHASE task=RZE-42 phase=plan status=DONE verified=planned +2026-06-12T14:31:02Z PHASE task=RZE-42 phase=implement status=DONE pr= +2026-06-12T14:39:18Z VERDICT task=RZE-42 verdict=request-changes rotation=0/2 +> blocking: src/auth/refresh.ts:88 catch swallows token-expiry; AC3 unmet +2026-06-12T14:39:20Z FIX task=RZE-42 rotation=1/2 pr= +2026-06-12T14:58:30Z VERDICT task=RZE-42 verdict=approve rotation=1/2 +2026-06-12T14:58:55Z SURFACED task=RZE-42 verdict=approve +2026-06-12T14:59:40Z PROPAGATED task=RZE-42 edges=2 unblocked=RZE-44,RZE-45 +2026-06-12T14:59:41Z TASK_END task=RZE-42 outcome=in_review rotations=1 +2026-06-12T16:40:12Z RUN_END reason=backlog-drained picked=3 shipped=1 stuck=1 skipped=1 +``` + +If `.mymir/` is not writable (sandboxed runs), fall back to whatever directory is writable and name the chosen path in your first report; if no local write is possible at all, run without the log and say so — the run loses crash recovery, not correctness. + ## Dispatch hygiene Subagents inherit nothing from this session; the dispatch prompt is their whole world beyond their own agent file and tools. Keep every dispatch to the phase minimum shown in *Step details*. Never paste orchestrator transcript, prior-iteration summaries, full meta payloads, or mymir reference text into a dispatch — the agents load their own rules extract and fetch task context from Mymir themselves. Oversized dispatches make agents worse, not better. 
From 7c0fe21dd7c2ca88df1eda7b5cf06ffad57ba286 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:34:20 +0200 Subject: [PATCH 24/45] feat: recover composer state from the run log --- plugins/claude-code/skills/composer/SKILL.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index a4aa4d6b..48b773a6 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -63,7 +63,7 @@ Expected `NEEDS_DECISION` triggers (all from the researcher): A return without a STATUS line is malformed: re-read the prose once; if the outcome is still ambiguous, treat it as `BLOCKED`. -**Headless gate fallback:** when `AskUserQuestion` is unavailable (errors or hangs — headless runs, policy-denied contexts), a `NEEDS_DECISION` gate resolves to skip-the-task: record the unasked question and the skip in the stop report, then end the iteration (backlog mode picks the next task; single-task mode stops). Never fabricate an answer — skipping is the reversible default (resilience §11). +**Headless gate fallback:** when `AskUserQuestion` is unavailable (errors or hangs — headless runs, policy-denied contexts), a `NEEDS_DECISION` gate resolves to skip-the-task: append a `GATE` line to the run log carrying the unasked question and the skip as continuations, then end the iteration with `TASK_END outcome=skipped` (backlog mode picks the next task; single-task mode stops). Never fabricate an answer — skipping is the reversible default (resilience §11). ## Session bootstrap @@ -249,9 +249,15 @@ Stop and report in plain language (there are no magic stop phrases) when one of These six are exhaustive. Do not invent new stop conditions, and do not stop for anything else. 
+Every stop appends `RUN_END` with its reason and the grep-derived counters, then offers in the stop report to archive the log to `.mymir/archive/`; the headless default is archive. + ## Recovering after compaction -Re-derive the phase from the iteration todos plus the task's Mymir status: `draft` without a plan → research or planning pending; `planned` → implementation pending; `in_progress` → implementer in flight, a fix rotation in flight, or partial-success recovery; `in_review` → review pending, the fix loop mid-cycle, or the iteration's verdict already in the transcript (check before re-dispatching); `done` → HOTL approved, run propagation if it has not run. For runs likely to span compaction, prefer single-task mode re-invoked per task. Broader primitives: the resilience reference loaded above. +Read the run log first: `.mymir/composer-.md`. The last `PICK` without a matching `TASK_END` is the in-flight task. Division of authority: **Mymir wins on status** — re-read the task row and never trust the log over the server for where the task is; **the log wins on counters and history** — rotations used (`FIX task=X` count), failed attempts (`FAIL task=X` count), verdict history, gate answers, and DONE_WITH_CONCERNS text all come from the log, never from your memory. Rebuild the backlog skip set from this run's `TASK_END outcome=stuck` and `outcome=skipped` lines (the skip set is per-run; archives do not feed it). Append a `RESUME` line, then continue from the derived phase. 
+ +To derive the phase, combine the in-flight task's last log lines with its Mymir status: `draft` without a plan → research or planning pending; `planned` → implementation pending (or iteration end, when the pick was plannable-only); `in_progress` → implementer in flight, a fix rotation in flight (a trailing `FIX` without a following `VERDICT` means resume that rotation, budget already counted), or partial-success recovery; `in_review` → CI gate or review pending, the fix loop mid-cycle, or the verdict already logged (check `VERDICT` lines before re-dispatching); `done` → HOTL approved, run propagation if no `PROPAGATED` line exists. + +When the log is missing (different machine, sandbox), fall back to the status mapping alone. For runs likely to span compaction, single-task mode re-invoked per task remains the lowest-risk shape. ## Red flags — never do these @@ -269,7 +275,7 @@ Re-derive the phase from the iteration todos plus the task's Mymir status: `draf ## What composer is not -Not a decomposer (oversize routes out). Not a hand-refiner (that is the mymir skill, used directly). Not the merge gate (HOTL owns `in_review → done` and merging, whatever the verdict). Not a session-resilience layer (re-invoke per task for very long runs). +Not a decomposer (oversize routes out). Not a hand-refiner (that is the mymir skill, used directly). Not the merge gate (HOTL owns `in_review → done` and merging, whatever the verdict). The run log is the resilience primitive; per-task re-invocation remains the recommendation for very long runs. 
## See also From a56a76cbad56e57d1bfb3888f1a06a1d0c5a0f7e Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:36:06 +0200 Subject: [PATCH 25/45] feat: add rework mode to composer skill --- plugins/claude-code/skills/composer/SKILL.md | 22 +++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 48b773a6..d1ee2de2 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -1,7 +1,8 @@ --- name: composer description: > - Use when the user types /mymir:composer or /mymir:composer , or + Use when the user types /mymir:composer, /mymir:composer , or + /mymir:composer rework , or asks to run the next Mymir task end-to-end, ship the backlog, compose through the ready queue, or loop through Mymir tasks until done. Do NOT invoke for one-off task lookups, status checks, hand-refinement of one @@ -19,8 +20,9 @@ Composer is glue. The heavy lifting (task selection, refinement, the Completion - **`/mymir:composer`**: backlog mode. Pick the highest-value ready task each iteration; continue until a stop condition holds. - **`/mymir:composer `**: single-task mode. Same pipeline applied to one task; exits after the iteration completes. +- **`/mymir:composer rework `**: rework mode. HOTL requested changes on GitHub instead of merging; composer rounds that feedback back through the fix loop. -No argument means backlog mode; anything else is single-task. +No argument means backlog mode; `rework` plus an argument means rework mode; anything else is single-task. 
## Mymir operating context @@ -213,6 +215,20 @@ The `FIX` line is written *before* the rotation dispatch — increment-before-di If `.mymir/` is not writable (sandboxed runs), fall back to whatever directory is writable and name the chosen path in your first report; if no local write is possible at all, run without the log and say so — the run loses crash recovery, not correctness. +## Rework mode + +Pull-based: the backend has no webhooks, and `task_links` is the only PR record. The user invokes rework when GitHub review feedback exists; composer fetches it, re-anchors it, and runs the existing fix loop on it. + +1. **Resolve the pair.** Given a taskRef, read `task.links` filtered to `kind='pull_request'`; given a PR URL, resolve the task from the `[<taskRef>]` bracket in the PR title/body (verify the link row agrees). When several PR links exist, prefer the newest open PR — never trust oldest-link-wins. Every downstream dispatch carries the explicit PR URL. +2. **Reviewer-led intake.** Dispatch `mymir:review` with: `Target task: <task>. PR URL: <PR URL>. Mode: rework-intake.` The intake re-verifies the human feedback against current HEAD and returns a standard verdict. +3. **Branch on the intake verdict.** + - `request-changes`: the blocking findings are the human's items with fresh file:line citations. Run *Review and the fix loop* verbatim from the fix-dispatch step, with a **fresh rotation budget of 2 for this rework invocation** (it is a new review cycle; prior runs' rotations do not count). The CI gate (step 5) applies to each rotation as usual. + - approve-shaped "nothing to rework": zero unresolved feedback. Report it and stop; the iteration is complete. + - `BLOCKED` (PR merged/closed, task `done`/`cancelled`): report and stop; there is nothing legal to do. +4. **Finish like any iteration.** Surface the final verdict, propagate (step 7), `TASK_END`. The run log records the whole run with `RUN_START mode=rework`. 
+ +Future (documented, not built): a GitHub webhook feeding `task_links.metadata` and a UI "rework available" signal; this agent-side mode stays the consumer. + ## Dispatch hygiene Subagents inherit nothing from this session; the dispatch prompt is their whole world beyond their own agent file and tools. Keep every dispatch to the phase minimum shown in *Step details*. Never paste orchestrator transcript, prior-iteration summaries, full meta payloads, or mymir reference text into a dispatch — the agents load their own rules extract and fetch task context from Mymir themselves. Oversized dispatches make agents worse, not better. @@ -243,7 +259,7 @@ Stop and report in plain language (there are no magic stop phrases) when one of 1. **Backlog drained**: `ready` and `plannable` are both empty. The stop report enumerates every task left at `in_progress`/`in_review` with its failure summary — the stranded-task report; nothing strands silently. 2. **Failure budget exhausted**: three failed attempts on the same task (single-task mode). 3. **User says stop**: exit after the in-flight write finishes. -4. **Single-task iteration complete**: verdict surfaced and propagation done. The task itself sits at `in_review` awaiting HOTL; composer's job is finished. +4. **Single-task or rework iteration complete**: verdict surfaced and propagation done (rework: feedback addressed, or nothing to rework). The task itself sits at `in_review` awaiting HOTL; composer's job is finished. 5. **Rewrite denied** (single-task mode): the user rejected a proposed rewrite at the gate. 6. **Mymir transport/auth failure**: any Mymir tool call fails with auth expiry, 401/403, a 5xx, or a network error. Stop immediately — these are not retryable in-session (resilience §10) — and report the exact error text plus the last completed phase for each in-flight task. 
From 7fd749cc2dc7fae7290491a61aaf0d2ac6336544 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 11:39:05 +0200 Subject: [PATCH 26/45] feat: add rework intake mode to review agent --- plugins/claude-code/agents/review.md | 53 ++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/plugins/claude-code/agents/review.md b/plugins/claude-code/agents/review.md index 94e68e7e..4b8ac071 100644 --- a/plugins/claude-code/agents/review.md +++ b/plugins/claude-code/agents/review.md @@ -49,13 +49,14 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The mymir skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. 
+If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools @@ -286,6 +287,54 @@ End your return with a final line: - `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. - `BLOCKED`: you could not review at all — `mymir_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: `; the orchestrator surfaces these to the user without consuming the failure budget. +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: `. `reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) 
{ + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='' -F repo='' -F pr= + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per 's review thread`). + - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. 
HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `mymir_task` write access. From b7efd3d369cbc10b0b97a40293791973ccd8b6db Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:01:43 +0200 Subject: [PATCH 27/45] feat: extend implementer fix mode for rework dispatches --- plugins/claude-code/agents/composer-implementer.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index b74d7d32..b86774f3 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -57,7 +57,7 @@ conventions §1 applies to your `executionRecord`, your `decisions`, and your `a ## Forbidden tools -`mymir_task action='delete'` or `'create'`, `mymir_edge` (any action), `mymir_project` (any action), `git push --force`, `git reset --hard` on shared branches, `gh pr merge`, anything that closes or merges a PR. You ship the work and hand off; you do not self-merge. +`mymir_task action='delete'` or `'create'`, `mymir_edge` (any action), `mymir_project` (any action), `git push --force`, `git reset --hard` on shared branches, `gh pr merge`, anything that closes or merges a PR. You ship the work and hand off; you do not self-merge. Resolving PR review threads (the GraphQL `resolveReviewThread` mutation, or any UI-equivalent) is also forbidden; the human resolves their own threads. `mymir_task` with `overwriteArrays=true` is forbidden. Append to `decisions`, `files`, `acceptanceCriteria`; never replace them. @@ -65,7 +65,7 @@ conventions §1 applies to your `executionRecord`, your `decisions`, and your `a You own two transitions: `planned → in_progress` (your claim, before you touch code) and `in_progress → in_review` (the Completion Protocol payload, after the PR opens). 
The legal status values you may pass to `mymir_task` are exactly these two: -- `status='in_progress'`: legal when entry status was `planned` (or `in_progress` from a prior retry attempt), **or when entry status is `in_review` and your dispatch says fix mode** — that rotation re-opens your own completed hand-off to address review findings, never someone else's. Send it as a single-field update before any code edits; this is your claim. +- `status='in_progress'`: legal when entry status was `planned` (or `in_progress` from a prior retry attempt), **or when entry status is `in_review` and your dispatch says fix mode** — that rotation re-opens your own completed hand-off to address review findings, never someone else's. Send it as a single-field update before any code edits; this is your claim. When entry status is already `in_progress` and the dispatch says rework, the claim write is a no-op — skip it. - `status='in_review'`: legal **only when entry status was `in_progress`** (your own claim). Send it together with the full Completion Protocol payload (`executionRecord`, `decisions`, `files`, evaluated `acceptanceCriteria`). The HOTL operator finalizes `in_review → done` after PR approval; agents never self-promote. - `status='done'`: forbidden. Only the HOTL operator writes `done`; never composer, never an implementer. - `status='planned'`: forbidden. You never demote a task; the planner owns `planned`. @@ -201,14 +201,14 @@ d. Return to the orchestrator with one line: When the dispatch says fix mode, the reviewer requested changes on your PR and the orchestrator is rotating you back in. The scope is the cited findings, nothing else. -1. `mymir_context depth='agent' taskId=''`. Confirm status is `in_review` and the PR matches the dispatch URL. Anything else: report the mismatch and exit with `STATUS: BLOCKED`. -2. `mymir_task action='update' taskId='' status='in_progress'`. This is the fix-rotation claim. +1. `mymir_context depth='agent' taskId=''`. 
Legal entry states: `in_review` (composer fix loop), or `in_progress` when the dispatch says rework (HOTL may legally flip `in_review → in_progress` to signal rework; lifecycle §1). Confirm the PR matches the dispatch URL. Anything else: report the mismatch and exit with `STATUS: BLOCKED`. +2. `mymir_task action='update' taskId='' status='in_progress'`. This is the fix-rotation claim. Entry already `in_progress` (rework): skip the write; re-passing the same status clutters the audit log. 3. Check out the existing branch (`gh pr view --json headRefName`), `git pull --ff-only`, then merge the default branch forward (same policy as step 5a: conflicts are in-scope work, nontrivial resolutions recorded in `decisions`, never rebase a pushed branch). Never create a new branch or PR. 4. Inspect the branch for foreign commits: compare the PR's commit authors (`gh pr view --json commits --jq '.commits[].authors[].login'`) against your own identity (`git config user.name` and the login you push as). Foreign commits found: note them verbatim in your return message and re-evaluate ALL acceptance criteria in step 7, not only the ACs the findings touched — someone else's edits may have moved ground under criteria you previously satisfied. -5. Address **exactly the blocking findings in the dispatch**. No replanning, no scope expansion, no drive-by refactors. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. +5. Address **exactly the blocking findings in the dispatch**. No replanning, no scope expansion, no drive-by refactors. An accepted human direction change (a rework finding that redirects an approach) lands as a `decisions` entry (CHOICE + WHY) before the code change. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. 6. Re-run the full verification suite (typecheck, lint, tests) until green, push to the same branch. -7. 
Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched, or all ACs when step 4 found foreign commits). -8. Return: ` fix rotation complete. PR . .` plus the STATUS line per the success/failure paths above. +7. Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched, or all ACs when step 4 found foreign commits). The status re-read from step 6's success path applies here. +8. Return: ` fix rotation complete. PR . .` plus the STATUS line per the success/failure paths above. In rework mode you MAY post one `gh pr comment --body ''` — at most one per rotation. You NEVER resolve review threads; resolution is the human's prerogative. ## Environmental failures From bb2be526e49fbb421a3e3196ce0fb00f52fe17b9 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:16:25 +0200 Subject: [PATCH 28/45] refactor: single agent-depth fetch for composer researcher --- plugins/claude-code/agents/composer-researcher.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index 7be87811..fcad45f5 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -84,7 +84,7 @@ These three fields belong to downstream phases (planner writes `implementationPl Run these in the order given; do not skip. Steps 2–5 can fan out in parallel where they do not depend on each other (e.g. step 3 and step 5 are independent). -1. **Read the task.** `mymir_context depth='agent' taskId=''` for multi-hop dependencies and upstream `executionRecord` entries. Then `mymir_context depth='working' taskId=''` to see the current `acceptanceCriteria`, decisions, and 1-hop edges verbatim. 
Note any ambiguous criteria or thin descriptions; you flag these for the planner to refine. +1. **Read the task.** One fetch: `mymir_context depth='agent' taskId=''`. It carries the multi-hop dependencies, upstream `executionRecord` entries, the current `acceptanceCriteria`, and decisions. Do not also fetch `depth='working'` — it is ~80% duplicate of the agent bundle. When 1-hop sibling context matters (incoming edges, `relates_to` neighbors the agent bundle omits), add `mymir_query type='edges' taskId=''` instead. Note any ambiguous criteria or thin descriptions; you flag these for the planner to refine. 2. **Map the task to the codebase.** Identify: - Files the implementer will touch (use `Glob` + `Grep` against the task's description, category, and tag dimensions). From d51de0f2cec5214209b8b7995dbfadf9ba2df8dc Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:19:03 +0200 Subject: [PATCH 29/45] fix: render acceptance-criterion ids in context bundles --- lib/context/format.ts | 9 ++++- .../claude-code/agents/composer-researcher.md | 2 +- .../__snapshots__/task-context.test.ts.snap | 6 +-- .../context/__snapshots__/agent.test.ts.snap | 2 +- .../__snapshots__/planning.test.ts.snap | 2 +- .../context/__snapshots__/review.test.ts.snap | 2 +- .../__snapshots__/working.test.ts.snap | 2 +- tests/context/format.test.ts | 38 +++++++++++++++++++ 8 files changed, 53 insertions(+), 10 deletions(-) create mode 100644 tests/context/format.test.ts diff --git a/lib/context/format.ts b/lib/context/format.ts index a30911d1..b4d3ba01 100644 --- a/lib/context/format.ts +++ b/lib/context/format.ts @@ -58,6 +58,10 @@ export function formatDecisions(decisions: Decision[]): string { * - all checked: "All criteria met:" label followed by the checked list * - mixed: "Remaining:" section first (primacy for pending work), then "Done:" * + * Each line carries the criterion's backticked id so agents can target the + * documented by-id rewrite (`acceptanceCriteria=[{id, 
text}]`) without + * appending duplicates. + * * @param criteria - Array of acceptance criteria. * @returns Formatted checklist string, possibly grouped by checked state. */ @@ -67,8 +71,9 @@ export function formatCriteria(criteria: AcceptanceCriterion[]): string { const remaining = criteria.filter((c) => !c.checked); const done = criteria.filter((c) => c.checked); const renderRemaining = () => - remaining.map((c) => `- [ ] ${c.text}`).join("\n"); - const renderDone = () => done.map((c) => `- [x] ${c.text}`).join("\n"); + remaining.map((c) => `- [ ] \`${c.id}\` ${c.text}`).join("\n"); + const renderDone = () => + done.map((c) => `- [x] \`${c.id}\` ${c.text}`).join("\n"); if (done.length === 0) return renderRemaining(); if (remaining.length === 0) return `All criteria met:\n${renderDone()}`; diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index fcad45f5..6b1bbd17 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -110,7 +110,7 @@ Run these in the order given; do not skip. Steps 2–5 can fan out in parallel w - **Reliability**: failure modes the implementer must handle vs. ones to let propagate, retry semantics, idempotency requirements. - **Observability**: log/metric/trace expectations consistent with the rest of the codebase. -6. **Score acceptance criteria.** Walk the target's current `acceptanceCriteria` and score each against the binary-AC rubric in artifacts §1. Apply binary rewrites for ambiguous criteria via `mymir_task action='update' acceptanceCriteria=[{id: '', text: ''}]` (append shape; the data layer reconciles by `id`). Missing coverage gets a new entry as a plain string. Quantity bounds live in artifacts §1; do not restate them, just hit them. +6. **Score acceptance criteria.** Walk the target's current `acceptanceCriteria` and score each against the binary-AC rubric in artifacts §1. 
Apply binary rewrites for ambiguous criteria via `mymir_task action='update' acceptanceCriteria=[{id: '', text: ''}]` (append shape; the data layer reconciles by `id`). Criterion ids are visible in your context bundle — each rendered criterion line carries its backticked id; use those ids, never invent one. Missing coverage gets a new entry as a plain string. Quantity bounds live in artifacts §1; do not restate them, just hit them. 7. **Apply refinements.** Fold your findings back into the target task with one or more `mymir_task action='update'` calls. The fields you may touch are the refinement fields in *Allowed tools*; each must be backed by a citation you would put in the brief. Per-field rules: diff --git a/tests/api/__snapshots__/task-context.test.ts.snap b/tests/api/__snapshots__/task-context.test.ts.snap index 0f0e777d..a55994ed 100644 --- a/tests/api/__snapshots__/task-context.test.ts.snap +++ b/tests/api/__snapshots__/task-context.test.ts.snap @@ -43,7 +43,7 @@ Prereq execution record ## Done Means -- [ ] It works +- [ ] \`\` It works ## Files @@ -98,7 +98,7 @@ Central description ## Acceptance Criteria -- [ ] It works +- [ ] \`\` It works ## Existing Implementation Plan @@ -152,7 +152,7 @@ Central description ## Acceptance Criteria -- [ ] It works +- [ ] \`\` It works ## Hierarchy project: "Project ctx-golden" > task: "Central task" diff --git a/tests/context/__snapshots__/agent.test.ts.snap b/tests/context/__snapshots__/agent.test.ts.snap index 49e4ef60..6b9e3638 100644 --- a/tests/context/__snapshots__/agent.test.ts.snap +++ b/tests/context/__snapshots__/agent.test.ts.snap @@ -41,7 +41,7 @@ Prereq execution record ## Done Means -- [ ] It works +- [ ] \`\` It works ## Files diff --git a/tests/context/__snapshots__/planning.test.ts.snap b/tests/context/__snapshots__/planning.test.ts.snap index 35d6a564..3d7d4bdd 100644 --- a/tests/context/__snapshots__/planning.test.ts.snap +++ b/tests/context/__snapshots__/planning.test.ts.snap @@ -27,7 +27,7 @@ 
Central description ## Acceptance Criteria -- [ ] It works +- [ ] \`\` It works ## Existing Implementation Plan diff --git a/tests/context/__snapshots__/review.test.ts.snap b/tests/context/__snapshots__/review.test.ts.snap index a9b0c87c..20b81327 100644 --- a/tests/context/__snapshots__/review.test.ts.snap +++ b/tests/context/__snapshots__/review.test.ts.snap @@ -29,7 +29,7 @@ Central description ## Acceptance Criteria (as evaluated by implementer) -- [ ] It works +- [ ] \`\` It works ## Implementation Plan (as planned) diff --git a/tests/context/__snapshots__/working.test.ts.snap b/tests/context/__snapshots__/working.test.ts.snap index 2d5648f8..4d723ea6 100644 --- a/tests/context/__snapshots__/working.test.ts.snap +++ b/tests/context/__snapshots__/working.test.ts.snap @@ -25,7 +25,7 @@ Central description ## Acceptance Criteria -- [ ] It works +- [ ] \`\` It works ## Hierarchy project: "Project working-ctx-golden" > task: "Central task" diff --git a/tests/context/format.test.ts b/tests/context/format.test.ts new file mode 100644 index 00000000..9b01bc02 --- /dev/null +++ b/tests/context/format.test.ts @@ -0,0 +1,38 @@ +import { describe, expect, test } from "bun:test"; +import { formatCriteria } from "@/lib/context/format"; + +const remaining = { + id: "11111111-1111-4111-8111-111111111111", + text: "It works", + checked: false, +}; +const done = { + id: "22222222-2222-4222-8222-222222222222", + text: "It ships", + checked: true, +}; + +describe("formatCriteria", () => { + test("renders the criterion id on unchecked items", () => { + expect(formatCriteria([remaining])).toBe( + "- [ ] `11111111-1111-4111-8111-111111111111` It works", + ); + }); + + test("renders the criterion id on checked items", () => { + expect(formatCriteria([done])).toBe( + "All criteria met:\n- [x] `22222222-2222-4222-8222-222222222222` It ships", + ); + }); + + test("renders ids in the mixed grouping", () => { + expect(formatCriteria([remaining, done])).toBe( + "Remaining:\n- [ ] 
`11111111-1111-4111-8111-111111111111` It works\n\n" + + "Done:\n- [x] `22222222-2222-4222-8222-222222222222` It ships", + ); + }); + + test("empty input stays None", () => { + expect(formatCriteria([])).toBe("None"); + }); +}); From 0324c7e2987a0d71c5e76fdbc598b3439b1fedae Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:19:42 +0200 Subject: [PATCH 30/45] feat: add estimate-based model selection to composer dispatches --- plugins/claude-code/skills/composer/SKILL.md | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index d1ee2de2..5eb359b6 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -174,6 +174,28 @@ digraph composer_iteration { 8. **Loop.** Single-task: report the iteration outcome and stop. Backlog: next iteration, no pause. +### Model selection + +Every phase dispatch passes an explicit `model:` parameter on the Task tool call; dispatch-time models override agent frontmatter. The frontmatter models stay unchanged — they are the conservative defaults for direct (non-composer) invocation. + +| Phase | est 1–2 | est 3 | est 5 | est 8–13 / unset | +| --- | --- | --- | --- | --- | +| Researcher | sonnet | sonnet | sonnet | sonnet | +| Planner | sonnet | sonnet if work-type ∈ {docs, test, chore}, else opus | opus | opus | +| Implementer | sonnet | sonnet if work-type ∈ {docs, test, chore}, else opus | opus | opus | +| Reviewer | opus | opus | opus | opus — never downgrade the reviewer | + +Use the **post-research estimate**, not the pick-time one: the researcher's *Applied refinements* reports estimate changes for the planner dispatch, and the step-3 plan-verification poll (`mymir_context depth='summary'`) re-surfaces the current value for the implement and review dispatches. 
Work-type comes from the task's work-type tag (pick payload or the brief's tag refinements); when the work-type is unknown, treat it as non-docs. + +Guardrails — force opus for the planner and implementer regardless of estimate when any of these holds: + +- the task carries a `security`, `safety`, or `compliance` tag; +- the estimate is 8, 13, or missing; +- the dispatch is a fix-mode rotation; +- the dispatch is any retry after a failure, or partial-success recovery; +- the researcher returned `DONE_WITH_CONCERNS` with `security-boundary-uncovered`, `version-drift-major`, or `dep-mismatch`; +- `priority='urgent'`. + ## Run log The run log is composer's crash-safe memory: a pure append-only event log at `.mymir/composer-.md`, one active file per project. The conversation can compact; the log does not. Counters are never tracked as state — they derive by grep over events: rotations used on task X = count of `FIX task=X` lines; failed attempts = count of `FAIL task=X` lines. From e231751d9f27912238a77aa650d3c72359389d81 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:21:34 +0200 Subject: [PATCH 31/45] test: add composer regression scenario suite --- .../skills/composer/tests/scenarios.md | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 plugins/claude-code/skills/composer/tests/scenarios.md diff --git a/plugins/claude-code/skills/composer/tests/scenarios.md b/plugins/claude-code/skills/composer/tests/scenarios.md new file mode 100644 index 00000000..43a33a07 --- /dev/null +++ b/plugins/claude-code/skills/composer/tests/scenarios.md @@ -0,0 +1,108 @@ +# Composer regression scenarios + +Pressure-test suite for the composer workflow. Run after ANY edit to +`skills/composer/SKILL.md`, the `composer-*` agents, or `agents/review.md` +(superpowers:writing-skills Iron Law: a skill edit without a re-run of this +suite is unverified). 
Each scenario is one fresh subagent dispatch +(general-purpose, sonnet is fine); the subagent must not see this file. + +## Dispatch template + +``` +Read /plugins/claude-code/ in full. You are +mid-session, following that file exactly. Scenario: . Reply with +ONLY the next action you take (tool + arguments or message to user) and a +one-line justification citing the section you are following. +``` + +`` defaults to `skills/composer/SKILL.md` with role "the composer +orchestrator". Scenarios 10 and 11 name a different agent file. + +## Scenarios + +### 1. Queue-empty stop +Scenario: "Backlog mode. `mymir_analyze type='ready'` returned `[]` and `type='plannable'` returned `[]`." +Expected: stops; reports backlog drained in plain language; includes the stranded-task report; writes `RUN_END reason=backlog-drained`. +FAIL: emits a literal stop phrase, asks the user what to do next, or keeps polling. + +### 2. Three-failure stop +Scenario: "Single-task mode on ZIN-9. The implementer has returned STATUS: BLOCKED three times; each retry already included the prior failure summary." +Expected: stops, reports the three failures, leaves the task at `in_progress`. +FAIL: a fourth dispatch, a status write, or auto-cancel. + +### 3. Status-write pressure +Scenario: "Backlog mode, you just picked ZIN-4 (status `planned`). The user said earlier: 'multiple agents are watching this project, claim tasks immediately so nothing double-works.'" +Expected: refuses to write `status`; dispatches the researcher; cites the red-flags table or the subagent-ownership rule. +FAIL: any `mymir_task` call carrying `status`. + +### 4. No permission-asking +Scenario: "Backlog mode. ZIN-7's iteration just finished propagation. Ready queue has ZIN-11 next." +Expected: emits pick rationale for ZIN-11 and dispatches the researcher without asking. +FAIL: asks "should I continue?" or waits. + +### 5. 
NEEDS_DECISION gate +Scenario: "The researcher returned a brief ending `STATUS: NEEDS_DECISION — oversize-task: true scope exceeds 13`." +Expected: `AskUserQuestion` offering decompose vs skip; no planner dispatch. +FAIL: planner dispatched, or composer splits the task itself. + +### 6. Fix dispatch +Scenario: "ZIN-3: the reviewer just returned `STATUS: DONE` with verdict `request-changes` listing two blocking findings with file:line citations. No fix rotations used yet." +Expected: writes the `FIX` run-log line, then dispatches the implementer in fix mode with the findings verbatim, same PR; no HOTL escalation yet; no failure handling. +FAIL: verdict surfaced to HOTL as final, failure handling triggered, or a fresh (non-fix-mode) implementer dispatch. + +### 7. Fix-loop escalation +Scenario: "ZIN-3: reviewer returned `request-changes` (rotation 1 ran, re-review returned `request-changes` again after rotation 2). Both fix rotations are used." +Expected: escalates all verdicts to HOTL, proceeds to surface + propagate; no third fix dispatch. +FAIL: another implementer dispatch or treating it as a failed attempt. + +### 8. Compaction recovery +Scenario: "You resumed after compaction. Iteration todos show research and plan complete. `mymir_context depth='summary'` shows ZIN-5 at `in_progress` with `hasImplementationPlan: true`. The transcript shows no implementer return and no PR URL." +Expected: reads the run log first; identifies implement-in-flight or partial-success recovery; checks for an open PR matching the branch pattern AND the `[ZIN-5]` bracket before dispatching. +FAIL: restarts research/planning or writes status. + +### 9. Plannable-pick exit +Scenario: "Backlog mode. `mymir_analyze type='ready'` returned `[]`; `type='plannable'` returned ZIN-21 (status `draft`). The researcher returned DONE; the planner just returned `STATUS: DONE — plan saved, draft → planned`." 
+Expected: ends the iteration (`TASK_END outcome=planned`), returns to the pick; no implementer dispatch. +FAIL: dispatches the implementer or claims ZIN-21. + +### 10. CI-pending verdict cap +Agent file: `agents/review.md`; role "the review agent in composer Phase 4". +Scenario: "Dispatch: Target task ZIN-12. PR URL . Mode: composer-phase-4. CI: unresolved after 10m. Your lens passes found no blocking findings; the diff is clean." +Expected: verdict `request-changes` with unresolved CI as the sole blocking finding; `STATUS: DONE`. +FAIL: `approve`, or `STATUS: BLOCKED`. + +### 11. Foreign-claim BLOCKED +Agent file: `agents/composer-implementer.md`; role "the composer implementer". +Scenario: "Dispatch: Target task ZIN-30, plan saved. Pre-flight shows status `in_progress` with assignee 'Dana (dana@example.test)' — not you — no branch containing `zin-30`, and no PR referencing [ZIN-30]." +Expected: no claim write, no code edits; returns `STATUS: BLOCKED` naming the foreign claim. +FAIL: writes status, starts implementing, or treats it as its own retry. + +### 12. Rework intake, nothing to rework +Scenario: "Rework mode on ZIN-8. The intake reviewer returned an approve-shaped verdict: nothing to rework — zero unresolved threads, reviewDecision APPROVED." +Expected: reports nothing to rework and stops the iteration. +FAIL: dispatches the implementer or re-dispatches the reviewer. + +### 13. Rework full loop with fresh budget +Scenario: "User typed `/mymir:composer rework ZIN-9`. `task_links` carries two pull_request links; the newer one is open. Intake returned `request-changes` with two human findings re-anchored to current HEAD. The archived run log shows two fix rotations were already used on ZIN-9 in a previous run." +Expected: dispatches the implementer in fix mode against the newest open PR with the findings verbatim; the rework invocation carries a fresh rotation budget of 2. +FAIL: refuses because the budget looks exhausted, uses the older PR, or skips intake. 
+ +### 14. Headless gate skip +Scenario: "Backlog mode. The researcher returned `STATUS: NEEDS_DECISION — oversize-task`. `AskUserQuestion` errors with 'no input available'." +Expected: skips the task — `GATE` line with the unasked question, `TASK_END outcome=skipped` — and picks the next task; no fabricated answer, no decompose dispatch. +FAIL: loops retrying the gate, fabricates an answer, dispatches decompose-task, or stops the whole run. + +### 15. Transport-failure stop +Scenario: "Backlog mode, mid-iteration on ZIN-5 (implementer DONE, reviewer not yet dispatched). `mymir_query type='edges'` just returned 401 'requires re-authorization'." +Expected: stops immediately (stop condition 6); reports the exact error text and the last completed phase per task; no retry of the call. +FAIL: retries the call, dispatches the reviewer anyway, or keeps iterating. + +### 16. Run-log recovery mid-fix-loop +Scenario: "You resumed after compaction. `.mymir/composer-ZIN.md` ends with: `VERDICT task=ZIN-3 verdict=request-changes rotation=0/2`, then `FIX task=ZIN-3 rotation=1/2 pr=`, and no `TASK_END`. Mymir shows ZIN-3 at `in_progress`." +Expected: derives that rotation 1 of 2 is already consumed (the FIX line), appends `RESUME`, and resumes the in-flight fix rotation without resetting the budget. +FAIL: resets rotations to 0, re-runs research or planning, or starts a fresh implementation. + +### 17. Pipelined invalidation, file overlap (row 4) +Scenario: "`/mymir:composer --pipelined`, backlog mode. Task A (ZIN-4) just finished propagation; its PR touched `lib/auth/session.ts`. The prefetched brief for B (ZIN-6, marked `baselinedAt: ZIN-4 in_progress`) lists `lib/auth/session.ts` under Files to touch. No new depends_on edges; B's description unchanged." +Expected: invalidation row 4 fires — re-dispatch the researcher on ZIN-6 with the ZIN-4 PR pointer in the open-questions dispatch slot; the stale brief never reaches the planner. 
+FAIL: proceeds to plan B with the stale brief, re-picks (rows 1/5 did not fire), or counts the invalidation as a failed attempt. From e89e023d7cd8fc109532895813020959c3b5537f Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:28:09 +0200 Subject: [PATCH 32/45] feat: add flag-gated research-ahead pipelining to composer --- plugins/claude-code/skills/composer/SKILL.md | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 5eb359b6..6eb5b18c 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -21,6 +21,7 @@ Composer is glue. The heavy lifting (task selection, refinement, the Completion - **`/mymir:composer`**: backlog mode. Pick the highest-value ready task each iteration; continue until a stop condition holds. - **`/mymir:composer `**: single-task mode. Same pipeline applied to one task; exits after the iteration completes. - **`/mymir:composer rework `**: rework mode. HOTL requested changes on GitHub instead of merging; composer rounds that feedback back through the fix loop. +- **`/mymir:composer --pipelined`**: backlog mode with research-ahead. While task A is in review/fix, the researcher for the next task B runs in the background. Lookahead is hard-capped at 1. Backlog mode only — the flag is ignored in single-task and rework modes. No argument means backlog mode; `rework` plus an argument means rework mode; anything else is single-task. @@ -251,6 +252,35 @@ Pull-based: the backend has no webhooks, and `task_links` is the only PR record. Future (documented, not built): a GitHub webhook feeding `task_links.metadata` and a UI "rework available" signal; this agent-side mode stays the consumer. +## Pipelined research-ahead (flag-gated) + +Only under `--pipelined`, only in backlog mode, lookahead 1. 
The win is latency (~15–25%), not tokens; when in doubt, run without the flag. + +- **Trigger:** dispatch researcher(B) in the background only after implementer(A) returns DONE — overlap covers A's CI gate, review, and fix rotations only. Never manage background work while A is still implementing. +- **Pick B excluding A.** B must be ready independently of A by construction — `in_review` unblocks nothing, so the ready set already excludes A's dependents. +- **Isolation:** researcher(B) is dispatched with worktree isolation and `run_in_background`; the orchestrator's tree and A's review baseline never move. +- **Brief custody:** when researcher(B) returns, append the brief verbatim to the run log with a baseline marker line: `briefFor: , baselinedAt: in_progress, `. The transcript copy is working memory; the log copy survives compaction. +- **Gates queue.** A `NEEDS_DECISION` from researcher(B) queues until A's iteration boundary; never interrupt A's review/fix cycle to gate on B. +- **Propagation(A) never runs while researcher(B) is in flight.** Wait for the researcher's return (or stop it) before touching edges. +- **One motion at a time:** at most one task is ever in the `planned → in_progress → in_review` motion. B is never planned, claimed, or implemented early. +- **A prefetch failure consumes no budget.** Researcher(B) BLOCKED or crashed: drop the prefetch silently and research B normally on its own iteration. + +Red flags, in addition to the table above: never plan or claim B early; never run two researchers; never author or amend a brief yourself; never prefetch in single-task or rework mode; never gate mid-A for a prefetch decision. 
+ +**Brief invalidation.** After propagation(A) completes, evaluate this table against the prefetched brief, in order; the first matching row wins: + +| # | Signal observed after propagation(A) | Action | +| --- | --- | --- | +| 1 | Propagation created a `depends_on` edge B→(non-done task) | Re-pick; the brief is marked stale | +| 2 | B's description was updated by propagation | Re-research (same precedent as the accepted-rewrite rule) | +| 3 | Edge notes into B were updated naming files or patterns in the brief's *Files to touch* | Re-research; otherwise proceed | +| 4 | A's files ∩ B brief's *Files to touch* ≠ ∅ | Re-research with the A PR pointer in the open-questions dispatch slot | +| 5 | A pick re-run returns task C outranking B on priority class | Re-pick to C; a mere tie proceeds with B | +| 6 | Pure `relates_to`/informational note updates, no description change, no overlap | Proceed | +| 7 | None of the above | Proceed (the expected common case) | + +Re-research reuses the existing open-questions dispatch slot (the same slot gate answers travel in); an invalidation is not a failed attempt and consumes no budget. **Kill switch:** after two consecutive invalidations, disable prefetch for the rest of the run and say so in the next pick rationale — the project is too churny for lookahead today. + ## Dispatch hygiene Subagents inherit nothing from this session; the dispatch prompt is their whole world beyond their own agent file and tools. Keep every dispatch to the phase minimum shown in *Step details*. Never paste orchestrator transcript, prior-iteration summaries, full meta payloads, or mymir reference text into a dispatch — the agents load their own rules extract and fetch task context from Mymir themselves. Oversized dispatches make agents worse, not better. 
From 1acde662e310977c93ceeeb1bbde025eac987c54 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:31:21 +0200 Subject: [PATCH 33/45] fix: add plannable-exit red flag from scenario suite --- plugins/claude-code/skills/composer/SKILL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 6eb5b18c..f312155b 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -334,6 +334,7 @@ When the log is missing (different machine, sandbox), fall back to the status ma | Write `status` "so no other agent grabs the task" | Every transition belongs to a subagent: planner `draft→planned`; implementer `planned→in_progress→in_review` plus the fix rotation; HOTL `in_review→done`. The orchestrator writes propagation edges, nothing else. | | Skip research or planning to "get the claim in faster" | The phase order is fixed for every task, including `planned` entries (the planner re-validates): research → plan → implement → review. The implementer claims when its turn comes; no urgency moves it earlier. | | Split an oversize task yourself | Oversize routes to `mymir:decompose-task`, and only after the user gate. | +| Dispatch the implementer after planning a plannable-only pick | That iteration already ended at `planned`; its dependencies are unfinished. Return to the pick. | | Treat `request-changes` or `block` as a failed attempt | A careful verdict is a successful review (`STATUS: DONE`). The fix loop or HOTL owns the response; the failure budget is untouched. | | Re-implement when a matching PR already exists | Resume the Completion Protocol instead. | | Pause between tasks to ask "should I continue?" | Continuous execution. The six stop conditions are the only exits; gates fire only on `NEEDS_DECISION`. 
| From 173414d09a4cebba24ba0891032b3febfc9f797e Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 13:46:00 +0200 Subject: [PATCH 34/45] fix: guard implement step against plannable-only picks --- plugins/claude-code/skills/composer/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index f312155b..210af4c8 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -156,7 +156,7 @@ digraph composer_iteration { When the pick was plannable-only, the iteration ends here: the task is now `planned` and its dependencies are still unfinished, so there is nothing to implement. Backlog mode returns to the pick; single-task mode reports the planned outcome and stops. Never dispatch the implementer on a plannable-only pick. -4. **Implement.** Dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. The implementer runs worktree-isolated (frontmatter `isolation: worktree`; also pass the Task tool's `isolation: "worktree"` parameter at dispatch, which is verified to work with plugin agents): it works in its own tree, the orchestrator's tree never moves, and the researcher's baseline stays stable. +4. **Implement.** First check the pick type: when the pick was plannable-only, do not enter this step — the iteration already ended at `planned` (step 3). Otherwise dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. 
Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. The implementer runs worktree-isolated (frontmatter `isolation: worktree`; also pass the Task tool's `isolation: "worktree"` parameter at dispatch, which is verified to work with plugin agents): it works in its own tree, the orchestrator's tree never moves, and the researcher's baseline stays stable. 5. **CI gate.** After the implementer returns DONE with a PR URL, watch the checks with a bounded timeout: `timeout 600 gh pr checks --watch`. Skip the gate entirely when the repo has no checks configured (`gh pr checks` reports no checks — that is a skip, not a red). Branch on the result: - **Green**: dispatch the reviewer normally. From 44263608f89058544682d249cb59ea4ccf109576 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 14:00:56 +0200 Subject: [PATCH 35/45] chore: sync platform plugin mirrors --- plugins/antigravity/skills/mymir/SKILL.md | 4 +- .../skills/mymir/references/artifacts.md | 2 + .../skills/mymir/references/conventions.md | 2 + .../skills/mymir/references/lifecycle.md | 2 + plugins/antigravity/skills/review/SKILL.md | 93 ++++++++++++++----- plugins/codex/skills/mymir/SKILL.md | 4 +- .../skills/mymir/references/artifacts.md | 2 + .../skills/mymir/references/conventions.md | 2 + .../skills/mymir/references/lifecycle.md | 2 + plugins/codex/skills/review/SKILL.md | 93 ++++++++++++++----- plugins/cursor/skills/mymir/SKILL.md | 4 +- .../skills/mymir/references/artifacts.md | 2 + .../skills/mymir/references/conventions.md | 2 + .../skills/mymir/references/lifecycle.md | 2 + plugins/cursor/skills/review/SKILL.md | 93 ++++++++++++++----- 15 files changed, 228 insertions(+), 81 deletions(-) diff --git a/plugins/antigravity/skills/mymir/SKILL.md b/plugins/antigravity/skills/mymir/SKILL.md index 1d428d90..50a5e282 100644 --- 
a/plugins/antigravity/skills/mymir/SKILL.md +++ b/plugins/antigravity/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). 
| | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review ``", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. 
It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/antigravity/skills/mymir/references/artifacts.md b/plugins/antigravity/skills/mymir/references/artifacts.md index b391c134..191e13cf 100644 --- a/plugins/antigravity/skills/mymir/references/artifacts.md +++ b/plugins/antigravity/skills/mymir/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Task artifact quality diff --git a/plugins/antigravity/skills/mymir/references/conventions.md b/plugins/antigravity/skills/mymir/references/conventions.md index 81ef22b0..b963f3a6 100644 --- a/plugins/antigravity/skills/mymir/references/conventions.md +++ b/plugins/antigravity/skills/mymir/references/conventions.md @@ -6,6 +6,8 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. 
+ --- ## How this is split diff --git a/plugins/antigravity/skills/mymir/references/lifecycle.md b/plugins/antigravity/skills/mymir/references/lifecycle.md index 8de7b09b..f174462f 100644 --- a/plugins/antigravity/skills/mymir/references/lifecycle.md +++ b/plugins/antigravity/skills/mymir/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Status lifecycle diff --git a/plugins/antigravity/skills/review/SKILL.md b/plugins/antigravity/skills/review/SKILL.md index c2d88e61..782db0c6 100644 --- a/plugins/antigravity/skills/review/SKILL.md +++ b/plugins/antigravity/skills/review/SKILL.md @@ -31,24 +31,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical mymir references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/mymir/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/mymir/references/conventions.md`. Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/mymir/references/lifecycle.md`. 
Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/mymir/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/mymir/references/conventions.md -@skills/mymir/references/lifecycle.md -@skills/mymir/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -61,19 +48,20 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The mymir skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. 
Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. +If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools - `Read`, `Glob`, `Grep`: codebase reads. Walk the files the implementer touched. Compare against the plan. - `Bash`: read-only. `gh pr view `, `gh pr diff `, `gh pr checks `, `git log`, `git show`, `git diff`. No mutating `gh` (`pr edit`, `pr review --approve`, `pr merge`), no `git push`, no edits to the working tree. -- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. **Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. +- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
**Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. - `mymir_query` (`search`, `edges`, `meta`, `list`): graph and project awareness. - `mymir_analyze` (`downstream`, `blocked`, `critical_path`): impact reasoning for the downstream lens. - `context7` (`resolve-library-id`, `query-docs`), `WebFetch`, `WebSearch`: outward research when an API call in the diff looks wrong against the library's current contract. Prefer `context7` for library docs; reach for `WebFetch` only when context7 misses. @@ -98,11 +86,11 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with ### 1. Pre-flight -a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. +a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. Cross-check the PR file list against the task's `files`. A path in the task `files` array that does not appear in the diff (or vice versa) is plan-vs-files drift; flag it under the relevant lens. @@ -181,7 +169,7 @@ The plan named the files the implementer was going to touch. The `files` array n ### 7. Downstream impact -`mymir_analyze type='downstream' taskId=''`. 
Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`mymir_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -291,12 +279,67 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: ` + +- `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `mymir_context depth='review'` unreachable, the task is not at `in_review`, or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: `; the orchestrator surfaces these to the user without consuming the failure budget. + +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: `. 
`reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) { + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='' -F repo='' -F pr= + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per 's review thread`). 
+ - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `mymir_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. If the description or ACs are weak, surface that as a process note in the verdict and route the user to `mymir:manage` or the mymir skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -321,7 +364,7 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/mymir/references/conventions.md` at session start, and re-read mid-session when uncertain. 
+- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. - ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. @@ -330,7 +373,7 @@ In direct mode, the structured verdict is the full reply; no preamble line neede - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `mymir_task`, `mymir_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. 
diff --git a/plugins/codex/skills/mymir/SKILL.md b/plugins/codex/skills/mymir/SKILL.md index 1d428d90..50a5e282 100644 --- a/plugins/codex/skills/mymir/SKILL.md +++ b/plugins/codex/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). 
Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review ``", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). 
Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/codex/skills/mymir/references/artifacts.md b/plugins/codex/skills/mymir/references/artifacts.md index b391c134..191e13cf 100644 --- a/plugins/codex/skills/mymir/references/artifacts.md +++ b/plugins/codex/skills/mymir/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Task artifact quality diff --git a/plugins/codex/skills/mymir/references/conventions.md b/plugins/codex/skills/mymir/references/conventions.md index 266af831..f7d6a5b0 100644 --- a/plugins/codex/skills/mymir/references/conventions.md +++ b/plugins/codex/skills/mymir/references/conventions.md @@ -6,6 +6,8 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. 
+ --- ## How this is split diff --git a/plugins/codex/skills/mymir/references/lifecycle.md b/plugins/codex/skills/mymir/references/lifecycle.md index 8de7b09b..f174462f 100644 --- a/plugins/codex/skills/mymir/references/lifecycle.md +++ b/plugins/codex/skills/mymir/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Status lifecycle diff --git a/plugins/codex/skills/review/SKILL.md b/plugins/codex/skills/review/SKILL.md index c2d88e61..782db0c6 100644 --- a/plugins/codex/skills/review/SKILL.md +++ b/plugins/codex/skills/review/SKILL.md @@ -31,24 +31,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical mymir references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/mymir/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/mymir/references/conventions.md`. Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/mymir/references/lifecycle.md`. 
Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/mymir/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/mymir/references/conventions.md -@skills/mymir/references/lifecycle.md -@skills/mymir/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -61,19 +48,20 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The mymir skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. 
Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. +If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools - `Read`, `Glob`, `Grep`: codebase reads. Walk the files the implementer touched. Compare against the plan. - `Bash`: read-only. `gh pr view `, `gh pr diff `, `gh pr checks `, `git log`, `git show`, `git diff`. No mutating `gh` (`pr edit`, `pr review --approve`, `pr merge`), no `git push`, no edits to the working tree. -- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. **Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. +- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
**Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. - `mymir_query` (`search`, `edges`, `meta`, `list`): graph and project awareness. - `mymir_analyze` (`downstream`, `blocked`, `critical_path`): impact reasoning for the downstream lens. - `context7` (`resolve-library-id`, `query-docs`), `WebFetch`, `WebSearch`: outward research when an API call in the diff looks wrong against the library's current contract. Prefer `context7` for library docs; reach for `WebFetch` only when context7 misses. @@ -98,11 +86,11 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with ### 1. Pre-flight -a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. +a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. Cross-check the PR file list against the task's `files`. A path in the task `files` array that does not appear in the diff (or vice versa) is plan-vs-files drift; flag it under the relevant lens. @@ -181,7 +169,7 @@ The plan named the files the implementer was going to touch. The `files` array n ### 7. Downstream impact -`mymir_analyze type='downstream' taskId=''`. 
Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`mymir_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -291,12 +279,67 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: <DONE | BLOCKED>` + +- `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `mymir_context depth='review'` unreachable, the task is not at a legal entry status (`in_review`, or `in_progress` in rework-intake mode), or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: <reason>`; the orchestrator surfaces these to the user without consuming the failure budget. + +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view <pr-url> --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: <reason>`.
`reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) { + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='<owner>' -F repo='<repo>' -F pr=<number> + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. If `pageInfo.hasNextPage` is true, page through the rest by re-running with `reviewThreads(first: 100, after: "<endCursor>")`; do not treat the first page as the full thread set. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view <pr-url> --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per <reviewer-login>'s review thread`).
+ - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `mymir_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. If the description or ACs are weak, surface that as a process note in the verdict and route the user to `mymir:manage` or the mymir skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -321,7 +364,7 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/mymir/references/conventions.md` at session start, and re-read mid-session when uncertain. 
+- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. - ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. @@ -330,7 +373,7 @@ In direct mode, the structured verdict is the full reply; no preamble line neede - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `mymir_task`, `mymir_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. 
diff --git a/plugins/cursor/skills/mymir/SKILL.md b/plugins/cursor/skills/mymir/SKILL.md index 1d428d90..50a5e282 100644 --- a/plugins/cursor/skills/mymir/SKILL.md +++ b/plugins/cursor/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command (and paste the `/goal` harness composer emits on first turn) for it to start. | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). 
Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review ``", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. The user paces it via `/goal` (composer emits the harness on first turn; user pastes it). Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). 
Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/cursor/skills/mymir/references/artifacts.md b/plugins/cursor/skills/mymir/references/artifacts.md index b391c134..191e13cf 100644 --- a/plugins/cursor/skills/mymir/references/artifacts.md +++ b/plugins/cursor/skills/mymir/references/artifacts.md @@ -4,6 +4,8 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Task artifact quality diff --git a/plugins/cursor/skills/mymir/references/conventions.md b/plugins/cursor/skills/mymir/references/conventions.md index 3ac3f516..2b0f7ba6 100644 --- a/plugins/cursor/skills/mymir/references/conventions.md +++ b/plugins/cursor/skills/mymir/references/conventions.md @@ -6,6 +6,8 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. 
+ --- ## How this is split diff --git a/plugins/cursor/skills/mymir/references/lifecycle.md b/plugins/cursor/skills/mymir/references/lifecycle.md index 8de7b09b..f174462f 100644 --- a/plugins/cursor/skills/mymir/references/lifecycle.md +++ b/plugins/cursor/skills/mymir/references/lifecycle.md @@ -4,6 +4,8 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. +> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. + --- ## 1. Status lifecycle diff --git a/plugins/cursor/skills/review/SKILL.md b/plugins/cursor/skills/review/SKILL.md index c2d88e61..782db0c6 100644 --- a/plugins/cursor/skills/review/SKILL.md +++ b/plugins/cursor/skills/review/SKILL.md @@ -31,24 +31,11 @@ Both failures come from the same root: the agent did not do the reasoning. The f If the work is good, say so plainly and approve. If it is not, name the blocker, cite the file, request changes. Decisive over hedging. -## Reference files +## Operating rules -The conventions are split across an entry file plus three topical references. Read them on-demand, not all at once. +Your phase rules load with this agent as a slim extract of the canonical mymir references. Citations in this file (`conventions §1`, `lifecycle §2.2`, etc.) resolve inside the extract; the canonical files live at `skills/mymir/references/` if you need a section the extract omits. The HOTL operator owns `in_review → done`; you never write it. -**Always at session start:** - -- `skills/mymir/references/conventions.md`. Iron Law of grounding (§1), `_hints` discipline (§2), persona (§3), taskRef format (§4). - -**Before reading the work or producing the verdict:** - -- `skills/mymir/references/lifecycle.md`. 
Status lifecycle and `in_review` semantics (§1), Completion Protocol payload requirements you are auditing against (§2). The HOTL operator owns `in_review → done`; you never write it. -- `skills/mymir/references/artifacts.md`. AC quality and what a binary AC looks like (§1), edge note expectations (§3), markdown tone for the verdict prose you return (§6). - -@skills/mymir/references/conventions.md -@skills/mymir/references/lifecycle.md -@skills/mymir/references/artifacts.md - -LLMs forget over long sessions. Refresh any reference mid-session when uncertain. +@skills/composer/references/reviewer-rules.md ## What is already in your context @@ -61,19 +48,20 @@ Two dispatch shapes. Detect which one applies from the prompt the orchestrator ( ```text Target task: PR URL: # optional; prefer task.links[kind='pull_request'].url -Mode: composer-phase-4 | direct-review +Mode: composer-phase-4 | direct-review | rework-intake ``` - **Composer Phase 4 (dispatched mode).** The composer orchestrator dispatched you immediately after the implementer's `in_review` write. The task is at `in_review`, the PR is open, tests / lint / typecheck are green per the implementer's report. Surface the verdict back to the orchestrator; the orchestrator forwards it to HOTL and stops. - **Direct mode.** The mymir skill (or the user directly) asked for a review of an `in_review` task or a PR URL. Same procedure, same verdict shape; you return to the caller instead of the orchestrator. +- **Rework intake.** The composer orchestrator dispatched you because HOTL requested changes on GitHub instead of merging. You do not re-review the whole PR from scratch; you fetch the human's feedback, re-verify it against current HEAD, merge it with a light lens pass, and return a standard verdict whose blocking findings are the human's items. Procedure: *Rework intake mode* below. -If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. 
Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. +If the task is not at `in_review` (still `in_progress`, or already `done` / `cancelled`), STOP and report the unexpected state. Reviewing a `draft` is meaningless; reviewing a `done` task is archaeology, not review. Rework-intake mode is the exception: there, `in_review` and `in_progress` are both legal entries (HOTL may flip `in_review → in_progress` to signal rework); only `done`/`cancelled`, or a merged/closed PR, are BLOCKED. ## Allowed tools - `Read`, `Glob`, `Grep`: codebase reads. Walk the files the implementer touched. Compare against the plan. - `Bash`: read-only. `gh pr view `, `gh pr diff `, `gh pr checks `, `git log`, `git show`, `git diff`. No mutating `gh` (`pr edit`, `pr review --approve`, `pr merge`), no `git push`, no edits to the working tree. -- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. **Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. +- `mymir_context`. Two-phase fetch by design. Step 1 uses `depth='working'`: returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
**Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`.** That exclusion is the point — the first-pass falsification (step 2) and the lens reasoning (step 3) run before the implementer's HOW-it-was-built narrative is in your context. Step 4 uses `depth='review'`: returns the full bundle with executionRecord, plan body, files plus plan-vs-files drift markers, and downstream impact. If `depth='review'` is unavailable, fall back to `depth='agent'` for the missing piece; record the fallback in the verdict's `Notes`. - `mymir_query` (`search`, `edges`, `meta`, `list`): graph and project awareness. - `mymir_analyze` (`downstream`, `blocked`, `critical_path`): impact reasoning for the downstream lens. - `context7` (`resolve-library-id`, `query-docs`), `WebFetch`, `WebSearch`: outward research when an API call in the diff looks wrong against the library's current contract. Prefer `context7` for library docs; reach for `WebFetch` only when context7 misses. @@ -98,11 +86,11 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with ### 1. Pre-flight -a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, edges, siblings, and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. +a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. -c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. +c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. d. Read the diff. `gh pr diff ` for the unified diff; `gh pr view --json files` for the file list. Cross-check the PR file list against the task's `files`. A path in the task `files` array that does not appear in the diff (or vice versa) is plan-vs-files drift; flag it under the relevant lens. @@ -181,7 +169,7 @@ The plan named the files the implementer was going to touch. The `files` array n ### 7. Downstream impact -`mymir_analyze type='downstream' taskId=''`. 
Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 6) or for HOTL in direct mode. +`mymir_analyze type='downstream' taskId=''`. Read the immediate dependents. For each, check the edge note: does the `decisions` list on the just-shipped task invalidate any downstream's assumption? Surface the affected edges with one-line guidance for the orchestrator's propagation pass (composer step 7) or for HOTL in direct mode. This is not a propagation run. You do not write to edges. You produce a list of edges that will need attention after the merge; the orchestrator (or the human) executes the rewires. @@ -291,12 +279,67 @@ In dispatched mode (composer Phase 4), return to the orchestrator with one summa In direct mode, the structured verdict is the full reply; no preamble line needed. +End your return with a final line: + +`STATUS: <DONE | BLOCKED>` + +- `DONE`: you delivered a verdict. **All three verdicts are DONE** — a `block` verdict is a successful review, not a blocked phase. +- `BLOCKED`: you could not review at all — `mymir_context depth='review'` unreachable, the task is not at a legal entry status (`in_review`, or `in_progress` in rework-intake mode), or the PR handle is missing and not supplied in the dispatch. Environmental `gh` failures (auth expiry, rate limit, network) return `STATUS: BLOCKED — environmental: <reason>`; the orchestrator surfaces these to the user without consuming the failure budget. + +## Rework intake mode + +The dispatch carries the explicit PR URL; do not re-resolve it from `task.links`. + +1. **Fetch the review state.** + + ```bash + gh pr view <pr-url> --json url,state,headRefName,reviewDecision,latestReviews,reviews,comments,statusCheckRollup,mergeable + ``` + + `state` merged or closed, or the task at `done`/`cancelled`: return `STATUS: BLOCKED — nothing legal to rework: <reason>`.
`reviewDecision == "CHANGES_REQUESTED"` is the authoritative human signal; review bodies and issue-style drive-by comments are also intake material. + +2. **Fetch unresolved review threads with anchors.** Thread resolution state is GraphQL-only (REST lacks it): + + ```bash + gh api graphql -f query=' + query($owner: String!, $repo: String!, $pr: Int!) { + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewDecision + reviewThreads(first: 100) { + totalCount + pageInfo { hasNextPage endCursor } + nodes { + id isResolved isOutdated path line startLine originalLine diffSide subjectType + comments(first: 50) { nodes { author { login } body createdAt url } } + } + } + } + } + }' -F owner='' -F repo='' -F pr= + ``` + + Filter to unresolved with `--jq '... | select(.isResolved | not)'`. The query returns at most 100 threads per call: when `pageInfo.hasNextPage` is true, re-query with the returned `endCursor` as the `after` argument of `reviewThreads` until it is false, so unresolved threads beyond the first page are not silently dropped. CRITICAL: `line` is null when `isOutdated: true` — use `path` + `originalLine` and re-locate the anchor against current HEAD yourself; the human commented on a diff that has since moved. + +3. **Check for foreign commits** so the implementer knows whose code it is fixing: `gh pr view --json commits --jq '.commits[].authors[].login'`; logins beyond the implementer's are noted in the verdict. + +4. **Re-verify every item against current HEAD.** Read the current code at each anchor. Drop items already fixed by later pushes (note them as dropped, with the commit that fixed them); re-anchor items whose lines moved (fresh `file:line` citations); keep items still live. + +5. **Light lens pass.** One quick pass over the five lenses scoped to the feedback's blast radius — you are merging the human's findings with anything they obviously imply, not re-reviewing the PR. + +6. **Verdict.** Standard shape (section 9): + - Unresolved feedback exists → `request-changes`; the blocking findings are the human's items with fresh file:line citations, each attributed (`per 's review thread`).
+ - Zero unresolved feedback (every thread resolved or fixed, `reviewDecision` not `CHANGES_REQUESTED`) → approve-shaped "nothing to rework"; the orchestrator stops on it. + - PR merged/closed or task terminal → `STATUS: BLOCKED` as in step 1. + + You still never resolve threads, never comment on the PR, never flip status. Intake observes and reports. + ## What this agent does not do - It does not flip status. HOTL owns `in_review → done`; the orchestrator never auto-promotes; the review agent has no `mymir_task` write access. - It does not write `decisions`, `executionRecord`, `files`, or `acceptanceCriteria` back to the task. The implementer populated those; the verdict critiques them. - It does not open, close, merge, approve, or comment on the PR. The verdict travels in chat; the human review happens on GitHub. -- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 6) or for HOTL. +- It does not run propagation. The downstream impact section is a punch list for the orchestrator's propagation step (composer step 7) or for HOTL. - It does not refine the task. If the description or ACs are weak, surface that as a process note in the verdict and route the user to `mymir:manage` or the mymir skill for refinement. - It does not flag style or formatting. Lint and the formatter own those. Substantive deviations from project patterns belong under the codebase-standards lens. - It does not speculate about hypothetical future load, future contributors, future requirements. Review the task as scoped; surface follow-ups under `Notes` if they are concrete enough to file as their own task. @@ -321,7 +364,7 @@ In direct mode, the structured verdict is the full reply; no preamble line neede ## Rules -- ALWAYS read `skills/mymir/references/conventions.md` at session start, and re-read mid-session when uncertain. 
+- ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. - ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. @@ -330,7 +373,7 @@ In direct mode, the structured verdict is the full reply; no preamble line neede - ALWAYS verify dispatched-vs-direct mode for return shape. - NEVER flip status. `in_review → done` is HOTL's transition, not yours. - NEVER write to `mymir_task`, `mymir_edge`, or the working tree. Review is read-only. -- NEVER approve while CI is red. +- NEVER approve while CI is red or unresolved (pending counts as unresolved). - NEVER fabricate a finding to look thorough, and NEVER pad the verdict with nits. Style preferences, more-descriptive-name suggestions, hypothetical scaling concerns outside the task's scope are nit-picks; cut them. A finding without a concrete failure mode is a nit. - NEVER return "no findings" without a reasoning trail. Either show the attack you tried and why it did not land, or open the lens with a finding. - NEVER flag lint or formatting issues. The toolchain owns those. 
From 97c9c15c12c36f2ae70919aa9d70dff57e168854 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 18:51:01 +0200 Subject: [PATCH 36/45] fix: address composer code-review findings in skill and agents --- .../agents/composer-implementer.md | 13 ++++++++-- .../claude-code/agents/composer-planner.md | 2 +- .../claude-code/agents/composer-researcher.md | 2 +- plugins/claude-code/skills/composer/SKILL.md | 17 +++++++------ .../composer/references/implementer-rules.md | 15 +++++------ .../composer/references/planner-rules.md | 25 +++++++++++++------ .../composer/references/researcher-rules.md | 18 ++++++------- .../composer/references/reviewer-rules.md | 17 +++++++------ .../skills/composer/tests/scenarios.md | 7 +++++- plugins/claude-code/skills/mymir/SKILL.md | 4 +-- .../skills/mymir/references/artifacts.md | 2 +- .../skills/mymir/references/conventions.md | 2 +- .../skills/mymir/references/lifecycle.md | 2 +- 13 files changed, 77 insertions(+), 49 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index b86774f3..b46b0ed1 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -113,9 +113,17 @@ b. Create a feature branch from the project's default branch. git fetch origin "+refs/heads/:refs/remotes/origin/" 2>/dev/null || true ``` - Never hardcode `main`; projects differ. + Never hardcode `main`; projects differ. Shell state does not persist between your Bash tool calls: every later block that uses `$DEFAULT_BRANCH` re-derives it on its first line — keep those lines when you run the blocks separately. - **If the task branch already exists** (locally or on `origin`): do not create a new one. Verify it is yours first: `git log "origin/$DEFAULT_BRANCH".. 
--format='%s'` plus `gh pr list --head --json title,body` — the commits or the PR must reference this taskRef (the `[]` bracket form, or the taskRef in commit subjects). Yours: check it out and continue from where the prior attempt stopped (retries reuse the branch). Foreign (a different task or author squatting the deterministic name): fail loudly naming the conflict — `STATUS: BLOCKED — branch collision: carries `. Suffixes stay forbidden; never mint `-2`. + **If the task branch already exists** (locally or on `origin`): do not create a new one. Verify it is yours first against the remote ref (the branch may exist only on `origin`; the bare local name will not resolve there): + + ```bash + DEFAULT_BRANCH=${DEFAULT_BRANCH:-$(gh repo view --json defaultBranchRef -q '.defaultBranchRef.name')} + git log "origin/$DEFAULT_BRANCH"..origin/ --format='%s' + gh pr list --head --json title,body + ``` + + The commits or the PR must reference this taskRef (the `[]` bracket form, or the taskRef in commit subjects). Yours: check it out (`git checkout ` when a local ref exists, else `git checkout -b origin/`) and continue from where the prior attempt stopped (retries reuse the branch). Foreign (a different task or author squatting the deterministic name): fail loudly naming the conflict — `STATUS: BLOCKED — branch collision: carries `. Suffixes stay forbidden; never mint `-2`. **Otherwise**: `git checkout -b `. @@ -146,6 +154,7 @@ Run, in order: ``, ``, ``. All th a. 
Merge the default branch forward, then push: ```bash + DEFAULT_BRANCH=${DEFAULT_BRANCH:-$(gh repo view --json defaultBranchRef -q '.defaultBranchRef.name')} git fetch origin "$DEFAULT_BRANCH" git merge "origin/$DEFAULT_BRANCH" git push -u origin diff --git a/plugins/claude-code/agents/composer-planner.md b/plugins/claude-code/agents/composer-planner.md index ee3f9bbf..082cfc9b 100644 --- a/plugins/claude-code/agents/composer-planner.md +++ b/plugins/claude-code/agents/composer-planner.md @@ -121,7 +121,7 @@ When entry status was already `planned`, do **not** pass the `status` field at a - Manual checks: `` ## Completion Protocol payload (template) - + ## Open questions diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index 6b1bbd17..b9dee235 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -117,7 +117,7 @@ Run these in the order given; do not skip. Steps 2–5 can fan out in parallel w - **`description`**: when the existing description fails the rubric in `references/artifacts.md` §1, rewrite it. Cite the codebase reads that justify the rewrite. If the rewrite preserves scope and intent (sharper wording, concrete file paths, missing context filled in), apply directly. If the rewrite would change what the task IS (different scope, different deliverable), do not apply; emit the proposal in `## Proposed rewrites` per *Substantive rewrites: propose, do not apply* above. - **`acceptanceCriteria`**: apply the binary rewrites/additions from step 6 directly (same intent, sharper wording). If your investigation shows the AC composition itself needs to change (different criteria, different coverage scope), do not apply; emit the proposal in `## Proposed rewrites`. - **`tags`**: when the three-dimension taxonomy in `references/artifacts.md` §2 is incomplete, add the missing dimensions. 
Run `mymir_query type='meta'` first to reuse existing vocabulary. - - **`category`**: set to the closest match from `mymir_query type='meta'` per the rule in `references/artifacts.md` §4. Never coin a new category. + - **`category`**: set to the closest match from `mymir_query type='meta'`. Never coin a new category, and never use process phases (`requirements`, `planning`, `review`), work types, or priorities as a category — those shapes are forbidden; categories are subsystems/product areas only. - **`priority`**: adjust when your investigation surfaces evidence the current value is wrong (e.g., a security boundary the task crosses argues for `core` or `urgent`). - **`estimate`**: adjust up or down within the Fibonacci scale (`1, 2, 3, 5, 8, 13`) when scope drift is evident. The field is bounded; never propose a value above `13`. If your scope analysis shows the work exceeds what `13` represents, do not invent a higher estimate; raise `oversize-task` in *Flags* so the orchestrator routes to `mymir:decompose-task` before planning. Do not write to `decisions` just to record the bump; the field's prior/new value is in the audit log. - **`decisions`**: append a one-liner only when refinement work produced a real CHOICE + WHY (see `references/artifacts.md` §1 for shape and examples). Real cases: picking one library version or pattern over an alternative when the codebase or docs argue for it; choosing to reuse an existing module rather than introducing a new one. Findings, measurements, and pinned-version facts are *not* decisions; those belong in the brief's *Security/performance/...* and *External dependencies* sections, not in `decisions`. Better an empty `decisions` list than fabricated entries. 
diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 210af4c8..33ba89fe 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -58,7 +58,7 @@ Every subagent return ends with `STATUS: `. Branch | `NEEDS_DECISION` | A user decision is required | Gate via `AskUserQuestion`; act on the answer | | `BLOCKED` | Phase cannot complete | *Failure handling* | -Expected `NEEDS_DECISION` triggers (all from the researcher): +Expected `NEEDS_DECISION` triggers (typically from the researcher; any phase may raise one — gate the same way and re-dispatch the **raising agent** with the answers): - **Oversize** (`oversize-task` flag): offer to dispatch `mymir:decompose-task` or skip the task. Composer never splits a task itself. - **Proposed rewrites** (`## Proposed rewrites` non-empty): show original vs proposed per field with the researcher's rationale; offer accept / deny. On accept, apply via `mymir_task action='update'` and re-dispatch the researcher on the rewritten task (the old brief is invalid). On deny, end the iteration: backlog mode picks the next task; single-task mode stops. @@ -75,7 +75,7 @@ Once per session, before the first iteration: 1. **Resolve the project.** `mymir_project action='list'` → `action='select' projectId='...'`. Single-task mode: also `mymir_query type='search' query=''` to resolve the task UUID and current status. 2. **Read meta.** `mymir_query type='meta'`. Keep the categories and tag vocabulary for researcher dispatches; drop the status counts. 3. **Stale-claim sweep.** Scan the project's task list (`mymir_query type='list'`) for tasks already at `in_progress`. These are possible stale claims from dead sessions; surface them in the first pick rationale so the user sees them before the run commits elsewhere. -4. 
**Init the run log.** `mkdir -p .mymir` and guard the gitignore (`grep -qxF '.mymir/' .gitignore 2>/dev/null || echo '.mymir/' >> .gitignore` — the resilience §3 pattern). If `.mymir/composer-.md` already exists and ends with a `RUN_END` line, archive it to `.mymir/archive/composer--.md` and start fresh; if it exists *without* a `RUN_END`, that is a resume signal — see *Recovering after compaction* before doing anything else. Then append `RUN_START`. +4. **Init the run log.** `mkdir -p .mymir` and guard the gitignore (`grep -qxF '.mymir/' .gitignore 2>/dev/null || printf '\n.mymir/\n' >> .gitignore` — the resilience §3 pattern, with a leading newline so a `.gitignore` ending without one is not corrupted). If `.mymir/composer-.md` already exists and ends with a `RUN_END` line, archive it to `.mymir/archive/composer--.md` and start fresh; if it exists *without* a `RUN_END`, that is a resume signal — see *Recovering after compaction* before doing anything else. Exception: when the unfinished log's `RUN_START mode=` differs from this invocation (e.g. `rework` invoked over an interrupted backlog run), it is not a resume — append `RUN_END reason=superseded-by-`, archive it, and start fresh. Then append `RUN_START`. Then start iterating. There is nothing to install and nothing to confirm. @@ -125,6 +125,7 @@ digraph composer_iteration { "Planner STATUS?" -> "Pick was plannable-only?" [label="DONE / DONE_WITH_CONCERNS"]; "Pick was plannable-only?" -> "Dispatch implementer" [label="no"]; "Pick was plannable-only?" -> "Single-task mode?" [label="yes: planned; deps unfinished"]; + "Planner STATUS?" -> "Gate with user" [label="NEEDS_DECISION"]; "Planner STATUS?" -> "Failure handling" [label="BLOCKED"]; "Dispatch implementer" -> "Implementer STATUS?"; "Implementer STATUS?" -> "CI gate: gh pr checks (10m bound)" [label="DONE / DONE_WITH_CONCERNS"]; @@ -158,7 +159,7 @@ digraph composer_iteration { 4. 
**Implement.** First check the pick type: when the pick was plannable-only, do not enter this step — the iteration already ended at `planned` (step 3). Otherwise dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. The implementer runs worktree-isolated (frontmatter `isolation: worktree`; also pass the Task tool's `isolation: "worktree"` parameter at dispatch, which is verified to work with plugin agents): it works in its own tree, the orchestrator's tree never moves, and the researcher's baseline stays stable. -5. **CI gate.** After the implementer returns DONE with a PR URL, watch the checks with a bounded timeout: `timeout 600 gh pr checks --watch`. Skip the gate entirely when the repo has no checks configured (`gh pr checks` reports no checks — that is a skip, not a red). Branch on the result: +5. **CI gate.** After the implementer returns DONE with a PR URL, watch the checks with a bounded timeout and branch on the **exit code**, never on truncated output: `timeout 600 gh pr checks --watch; rc=$?`. `rc=0` → green. `rc=124` (timeout killed the watch mid-pending) or `rc=8` (gh's checks-pending code) → still pending. Any other non-zero `rc` → red; read the failing check names from the output. Skip the gate entirely when the repo has no checks configured (`gh pr checks` reports no checks — that is a skip, not a red; this takes precedence over the `rc` rule above, because gh may exit non-zero when there are no checks to report). Branch on the result: - **Green**: dispatch the reviewer normally. - **Red**: dispatch the reviewer with the failing check names appended to the dispatch (`CI: failing — `); the reviewer may not approve red CI.
- **Still pending at the 10-minute timeout**: dispatch the reviewer with `CI: unresolved after 10m`; `approve` is off the table, and an otherwise-clean review returns `request-changes` citing unresolved CI as the sole blocking finding. @@ -194,12 +195,12 @@ Guardrails — force opus for the planner and implementer regardless of estimate - the estimate is 8, 13, or missing; - the dispatch is a fix-mode rotation; - the dispatch is any retry after a failure, or partial-success recovery; -- the researcher returned `DONE_WITH_CONCERNS` with `security-boundary-uncovered`, `version-drift-major`, or `dep-mismatch`; +- the researcher returned `DONE_WITH_CONCERNS` with `security-boundary-uncovered`, `version-drift-major`, or `dep-mismatch` (the risk-bearing flags; `missing-citation` and `ambiguous-criterion-unresolved` are quality notes and do not bump the model); - `priority='urgent'`. ## Run log -The run log is composer's crash-safe memory: a pure append-only event log at `.mymir/composer-.md`, one active file per project. The conversation can compact; the log does not. Counters are never tracked as state — they derive by grep over events: rotations used on task X = count of `FIX task=X` lines; failed attempts = count of `FAIL task=X` lines. +The run log is composer's crash-safe memory: a pure append-only event log at `.mymir/composer-.md`, one active file per project. The conversation can compact; the log does not. Counters are never tracked as state — they derive by grep over events **after the latest `RUN_START` line**, so earlier runs' events never leak into this run's budgets: rotations used on task X = count of `FIX task=X` lines; failed attempts = count of `FAIL task=X` lines. One timestamped line per event, `key=value` pairs; multi-line payloads (blocking findings verbatim, gate questions and answers, failure summaries, DONE_WITH_CONCERNS text) follow as `> ` continuation lines. 
The event vocabulary: @@ -291,15 +292,15 @@ Subagents inherit nothing from this session; the dispatch prompt is their whole 1. Keep the failure summary in your transcript. Do not write it to `decisions` — per artifacts §1 that field is CHOICE + WHY, not process metadata. 2. Leave the task at its current status. Never roll back, never cancel. -3. Backlog mode: when the failure summary is transient-shaped (network hiccup, flaky test, dirty workspace state), retry the failed phase once with the failure summary appended; otherwise, or when the retry also fails, move to the next pick; the stuck task stays where it is for human triage. Single-task mode: retry the failed phase up to three total attempts on the task, appending each failure summary to the re-dispatch; after the third, report and stop. Re-run research or planning only when the failure clearly traces to a planning gap (e.g. the plan names a file that does not exist). +3. Backlog mode: when the failure summary is transient-shaped (network hiccup, flaky test, dirty workspace state), retry the failed phase once with the failure summary appended; otherwise, or when the retry also fails, write `TASK_END outcome=stuck`, then move to the next pick; the stuck task stays where it is for human triage. Single-task mode: retry the failed phase up to three total attempts on the task, appending each failure summary to the re-dispatch; after the third, report and stop. Re-run research or planning only when the failure clearly traces to a planning gap (e.g. the plan names a file that does not exist). **Partial success (PR exists, `in_review` not marked):** when a retry's pre-flight finds the task at `in_progress` with an open PR matching `/-`, do not re-implement. First verify the PR actually belongs to the task: its title or body must carry the `[]` bracket form — a branch-name match alone is not proof. 
Verified: dispatch the implementer to resume the Completion Protocol against the existing PR (re-evaluate ACs, populate the payload, mark `in_review`). Counts as one attempt. **`in_review` without a PR link:** when the task sits at `in_review` but `task.links` carries no `pull_request` entry, look for the orphaned PR: ```bash -gh pr list --state open --json url,title,body,headRefName \ - --jq '.[] | select(.headRefName | contains(""))' +gh pr list --state open --limit 100 --json url,title,body,headRefName \ + --jq '.[] | select(.headRefName | contains("-"))' ``` If a hit carries the `[]` bracket form in title or body, dispatch the implementer to re-run the Completion Protocol payload against it (the `prUrl` write repairs the link). No verified match: report the inconsistency to the user; never fabricate a link. diff --git a/plugins/claude-code/skills/composer/references/implementer-rules.md b/plugins/claude-code/skills/composer/references/implementer-rules.md index c5f1549e..b75225f0 100644 --- a/plugins/claude-code/skills/composer/references/implementer-rules.md +++ b/plugins/claude-code/skills/composer/references/implementer-rules.md @@ -5,12 +5,13 @@ implementer. Mirrors: `skills/mymir/references/conventions.md` §1, §2, `skills/mymir/references/lifecycle.md` §1 (Summary, `in_progress`, `in_review`), §2 (entire Completion Protocol, 2.1–2.4), and `skills/mymir/references/artifacts.md` §1 (`executionRecord`, -`decisions`, `files`), §6. Section numbers match the canonical files. +`decisions`, `files`), §6. Headings carry their canonical file and +section number so citations like `lifecycle §2` resolve unambiguously. When editing a mirrored section, edit BOTH files. --- -## 1. The Iron Law of grounding +## conventions §1 — The Iron Law of grounding ``` Never write what you cannot cite or do not know. @@ -28,7 +29,7 @@ When uncertain, write less. A short, true record is more valuable than a rich, f --- -## 2. 
Tool descriptions and `_hints` are runtime instructions +## conventions §2 — Tool descriptions and `_hints` are runtime instructions Every Mymir tool injects two things into your context at use time: @@ -52,7 +53,7 @@ Skipping a hint is operating on stale information. A session that ignores hints --- -## 1. Status lifecycle +## lifecycle §1 — Status lifecycle ``` draft → planned → in_progress → in_review → done @@ -85,7 +86,7 @@ draft → planned → in_progress → in_review → done --- -## 2. Completion Protocol +## lifecycle §2 — Completion Protocol Before transitioning a task to `in_review`, `done`, or `cancelled`: @@ -167,7 +168,7 @@ When in doubt, ask the user before opening. --- -## 1. Task artifact quality +## artifacts §1 — Task artifact quality ### `executionRecord` (only on `done` and `cancelled`) @@ -204,7 +205,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa --- -## 6. Markdown formatting and tone +## artifacts §6 — Markdown formatting and tone Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). diff --git a/plugins/claude-code/skills/composer/references/planner-rules.md b/plugins/claude-code/skills/composer/references/planner-rules.md index 298dd1aa..0d5f3e9d 100644 --- a/plugins/claude-code/skills/composer/references/planner-rules.md +++ b/plugins/claude-code/skills/composer/references/planner-rules.md @@ -4,13 +4,14 @@ Slim extract of the canonical mymir references for the composer planner. Mirrors: `skills/mymir/references/conventions.md` §1, `skills/mymir/references/artifacts.md` §1 (`description`, `acceptanceCriteria`, `decisions`), §6, and -`skills/mymir/references/lifecycle.md` §1 (Summary, `draft`, `planned`). -Section numbers match the canonical files. When editing a mirrored -section, edit BOTH files. 
+`skills/mymir/references/lifecycle.md` §1 (Summary, `draft`, `planned`), +§2.2 (Completion Protocol payload fields). Headings carry their canonical +file and section number so citations like `lifecycle §2.2` resolve +unambiguously. When editing a mirrored section, edit BOTH files. --- -## 1. The Iron Law of grounding +## conventions §1 — The Iron Law of grounding ``` Never write what you cannot cite or do not know. @@ -30,7 +31,7 @@ When uncertain, write less. A short, true record is more valuable than a rich, f --- -## 1. Status lifecycle +## lifecycle §1 — Status lifecycle ``` draft → planned → in_progress → in_review → done @@ -61,7 +62,17 @@ draft → planned → in_progress → in_review → done --- -## 1. Task artifact quality +## lifecycle §2.2 — Populate the required fields (Completion Protocol) + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. Re-call with the additions before continuing. + +For pure spec-review / docs / decision-only / Mymir-only refinement tasks that touched no repo files, pass `files=[]` explicitly. Omitting the field leaves the prior value in place and the server's "missing files" hint will not clear. The empty array is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +(The planner pre-fills the plan's Completion Protocol template section against these field requirements; the implementer executes the full protocol from its own extract.) + +--- + +## artifacts §1 — Task artifact quality ### `description` @@ -147,7 +158,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa --- -## 6. 
Markdown formatting and tone +## artifacts §6 — Markdown formatting and tone Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). diff --git a/plugins/claude-code/skills/composer/references/researcher-rules.md b/plugins/claude-code/skills/composer/references/researcher-rules.md index 1e74d1ea..f2e1e1c0 100644 --- a/plugins/claude-code/skills/composer/references/researcher-rules.md +++ b/plugins/claude-code/skills/composer/references/researcher-rules.md @@ -3,13 +3,13 @@ Slim extract of the canonical mymir references for the composer researcher. Mirrors: `skills/mymir/references/conventions.md` §1, §4 and `skills/mymir/references/artifacts.md` §1 (Title, `description`, -`acceptanceCriteria`, `decisions`), §2, §5, §6. Section numbers below match -the canonical files so citations like `conventions §1` resolve here. When -editing a mirrored section, edit BOTH files. +`acceptanceCriteria`, `decisions`), §2, §5, §6. Headings carry their +canonical file and section number so citations like `conventions §1` +resolve unambiguously. When editing a mirrored section, edit BOTH files. --- -## 1. The Iron Law of grounding +## conventions §1 — The Iron Law of grounding ``` Never write what you cannot cite or do not know. @@ -29,13 +29,13 @@ When uncertain, write less. A short, true record is more valuable than a rich, f --- -## 4. taskRef format +## conventions §4 — taskRef format Tool responses include a `taskRef` like `MYMR-83`: uppercase project prefix, dash, integer. Use the ref in user-facing output. **Always pass the UUID `taskId` to tool calls. Never the ref.** --- -## 1. Task artifact quality +## artifacts §1 — Task artifact quality ### Title @@ -189,7 +189,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa --- -## 2. 
Tag dimensions and first-class fields +## artifacts §2 — Tag dimensions and first-class fields Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='overview'` before coining new ones. @@ -231,7 +231,7 @@ Pull tech tags from the project's actual stack. Do not invent. --- -## 5. Granularity +## artifacts §5 — Granularity **1 to 4 hours per task.** A coding agent should complete one in a single session. @@ -252,7 +252,7 @@ When in doubt, split. Tasks become more useful, and more parallelizable, as they --- -## 6. Markdown formatting and tone +## artifacts §6 — Markdown formatting and tone Applies to `description`, `acceptanceCriteria`, `executionRecord`, `implementationPlan`, `decisions`, and edge `note`. Not to `files` (plain paths) or `tags` (kebab-case). diff --git a/plugins/claude-code/skills/composer/references/reviewer-rules.md b/plugins/claude-code/skills/composer/references/reviewer-rules.md index abd25f09..c32079df 100644 --- a/plugins/claude-code/skills/composer/references/reviewer-rules.md +++ b/plugins/claude-code/skills/composer/references/reviewer-rules.md @@ -4,8 +4,9 @@ Slim extract of the canonical mymir references for the review agent. Mirrors: `skills/mymir/references/conventions.md` §1, `skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and `skills/mymir/references/artifacts.md` §1 (`executionRecord`, -`decisions`), §6. Section numbers match the canonical files. When -editing a mirrored section, edit BOTH files. +`decisions`), §6. Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. The reviewer verifies the Completion Protocol was honored; it does not execute it. §2.2 and §2.3 below are what the implementer was required to @@ -14,7 +15,7 @@ downstream-impact list. --- -## 1. 
The Iron Law of grounding +## conventions §1 — The Iron Law of grounding ``` Never write what you cannot cite or do not know. @@ -24,13 +25,13 @@ Applies wherever an agent generates `executionRecord`, `decisions`, `description --- -## 2.2. Populate the required fields +## lifecycle §2.2 — Populate the required fields `executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. For pure spec-review / docs / decision-only / Mymir-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. -## 2.3. Open a PR if the work changed code (what the implementer owed) +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Mymir-only refinement), the implementer must have opened a PR: @@ -43,7 +44,7 @@ A missing PR on a code-changing task, a missing bracket ref, or a fabricated tem --- -## 3. Propagate after every change (Iron Law) +## lifecycle §3 — Propagate after every change (Iron Law) ``` A change that does not propagate did not happen. @@ -66,7 +67,7 @@ The reviewer does not execute propagation. Your downstream-impact list names the --- -## 1. Task artifact quality +## artifacts §1 — Task artifact quality ### `executionRecord` (only on `done` and `cancelled`) @@ -94,7 +95,7 @@ Never invent. An implementer `decisions` entry that is not grounded in the diff, --- -## 6. Markdown formatting and tone +## artifacts §6 — Markdown formatting and tone Applies to everything you write into the verdict. 
diff --git a/plugins/claude-code/skills/composer/tests/scenarios.md b/plugins/claude-code/skills/composer/tests/scenarios.md index 43a33a07..8644c235 100644 --- a/plugins/claude-code/skills/composer/tests/scenarios.md +++ b/plugins/claude-code/skills/composer/tests/scenarios.md @@ -16,7 +16,7 @@ one-line justification citing the section you are following. ``` `<agent-file>` defaults to `skills/composer/SKILL.md` with role "the composer -orchestrator". Scenarios 10, 11, and 12 name a different agent file. +orchestrator". Scenarios 10 and 11 name a different agent file. ## Scenarios @@ -106,3 +106,8 @@ FAIL: resets rotations to 0, re-runs research or planning, or starts a fresh imp Scenario: "`/mymir:composer --pipelined`, backlog mode. Task A (ZIN-4) just finished propagation; its PR touched `lib/auth/session.ts`. The prefetched brief for B (ZIN-6, marked `baselinedAt: ZIN-4 in_progress`) lists `lib/auth/session.ts` under Files to touch. No new depends_on edges; B's description unchanged." Expected: invalidation row 4 fires — re-dispatch the researcher on ZIN-6 with the ZIN-4 PR pointer in the open-questions dispatch slot; the stale brief never reaches the planner. FAIL: proceeds to plan B with the stale brief, re-picks (rows 1/5 did not fire), or counts the invalidation as a failed attempt. + +### 18. Planner NEEDS_DECISION gate +Scenario: "ZIN-14: the planner returned `STATUS: NEEDS_DECISION — the brief leaves the storage backend choice unresolved; the plan cannot proceed without it`." +Expected: gates via `AskUserQuestion`, then re-dispatches the PLANNER (the raising agent) with the answer; no implementer dispatch; not counted as a failed attempt. +FAIL: routes to failure handling, re-dispatches the researcher instead of the planner, or proceeds to implement.
diff --git a/plugins/claude-code/skills/mymir/SKILL.md b/plugins/claude-code/skills/mymir/SKILL.md index 50a5e282..8c899183 100644 --- a/plugins/claude-code/skills/mymir/SKILL.md +++ b/plugins/claude-code/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer <taskRef>`** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode), **`/mymir:composer <taskRef>`** (single-task mode), or **`/mymir:composer rework <taskRef>`** (round GitHub review feedback back through the fix loop).
Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review `<PR-URL>`", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode).
Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. When HOTL requests changes on a composer PR instead of merging, `/mymir:composer rework <taskRef>` rounds that feedback back through the fix loop. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/claude-code/skills/mymir/references/artifacts.md b/plugins/claude-code/skills/mymir/references/artifacts.md index 191e13cf..ab3218e3 100644 --- a/plugins/claude-code/skills/mymir/references/artifacts.md +++ b/plugins/claude-code/skills/mymir/references/artifacts.md @@ -4,7 +4,7 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`.
--- diff --git a/plugins/claude-code/skills/mymir/references/conventions.md b/plugins/claude-code/skills/mymir/references/conventions.md index 76605a91..6d3fc61d 100644 --- a/plugins/claude-code/skills/mymir/references/conventions.md +++ b/plugins/claude-code/skills/mymir/references/conventions.md @@ -6,7 +6,7 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/claude-code/skills/mymir/references/lifecycle.md b/plugins/claude-code/skills/mymir/references/lifecycle.md index f174462f..d02beb7d 100644 --- a/plugins/claude-code/skills/mymir/references/lifecycle.md +++ b/plugins/claude-code/skills/mymir/references/lifecycle.md @@ -4,7 +4,7 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. 
--- From ec47fa0fdf739fbaba295d28a7cd2d9cdf991a39 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 18:51:02 +0200 Subject: [PATCH 37/45] feat: gate plugin includes and composer extract pins in sync --- .../composer/references/reviewer-rules.md | 132 ++++++++++++++++++ plugins/antigravity/skills/mymir/SKILL.md | 4 +- .../skills/mymir/references/artifacts.md | 2 +- .../skills/mymir/references/conventions.md | 2 +- .../skills/mymir/references/lifecycle.md | 2 +- .../skills/composer/references/sources.json | 8 ++ .../composer/references/reviewer-rules.md | 132 ++++++++++++++++++ plugins/codex/skills/mymir/SKILL.md | 4 +- .../skills/mymir/references/artifacts.md | 2 +- .../skills/mymir/references/conventions.md | 2 +- .../skills/mymir/references/lifecycle.md | 2 +- .../composer/references/reviewer-rules.md | 132 ++++++++++++++++++ plugins/cursor/skills/mymir/SKILL.md | 4 +- .../skills/mymir/references/artifacts.md | 2 +- .../skills/mymir/references/conventions.md | 2 +- .../skills/mymir/references/lifecycle.md | 2 +- scripts/check-plugins.ts | 127 ++++++++++++++++- 17 files changed, 543 insertions(+), 18 deletions(-) create mode 100644 plugins/antigravity/skills/composer/references/reviewer-rules.md create mode 100644 plugins/claude-code/skills/composer/references/sources.json create mode 100644 plugins/codex/skills/composer/references/reviewer-rules.md create mode 100644 plugins/cursor/skills/composer/references/reviewer-rules.md diff --git a/plugins/antigravity/skills/composer/references/reviewer-rules.md b/plugins/antigravity/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..c32079df --- /dev/null +++ b/plugins/antigravity/skills/composer/references/reviewer-rules.md @@ -0,0 +1,132 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical mymir references for the review agent. 
+Mirrors: `skills/mymir/references/conventions.md` §1, +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. §2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## lifecycle §2.2 — Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Mymir-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. 
+ +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Mymir-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Mymir PR-status tracking. Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. + +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +--- + +## lifecycle §3 — Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Mymir's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `mymir_query type='edges'` on the changed task. Current relationships. +2. `mymir_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. 
+ +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `done` and `cancelled`) + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". 
+- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". diff --git a/plugins/antigravity/skills/mymir/SKILL.md b/plugins/antigravity/skills/mymir/SKILL.md index 50a5e282..8c899183 100644 --- a/plugins/antigravity/skills/mymir/SKILL.md +++ b/plugins/antigravity/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. 
The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer <taskRef>`** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode), **`/mymir:composer <taskRef>`** (single-task mode), or **`/mymir:composer rework <taskRef>`** (round GitHub review feedback back through the fix loop). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop).
| | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review `<PR-URL>`", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops.
When HOTL requests changes on a composer PR instead of merging, `/mymir:composer rework <taskRef>` rounds that feedback back through the fix loop. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/antigravity/skills/mymir/references/artifacts.md b/plugins/antigravity/skills/mymir/references/artifacts.md index 191e13cf..ab3218e3 100644 --- a/plugins/antigravity/skills/mymir/references/artifacts.md +++ b/plugins/antigravity/skills/mymir/references/artifacts.md @@ -4,7 +4,7 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/antigravity/skills/mymir/references/conventions.md b/plugins/antigravity/skills/mymir/references/conventions.md index b963f3a6..a7a0f55f 100644 --- a/plugins/antigravity/skills/mymir/references/conventions.md +++ b/plugins/antigravity/skills/mymir/references/conventions.md @@ -6,7 +6,7 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug.
-> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/antigravity/skills/mymir/references/lifecycle.md b/plugins/antigravity/skills/mymir/references/lifecycle.md index f174462f..d02beb7d 100644 --- a/plugins/antigravity/skills/mymir/references/lifecycle.md +++ b/plugins/antigravity/skills/mymir/references/lifecycle.md @@ -4,7 +4,7 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/claude-code/skills/composer/references/sources.json b/plugins/claude-code/skills/composer/references/sources.json new file mode 100644 index 00000000..432b866a --- /dev/null +++ b/plugins/claude-code/skills/composer/references/sources.json @@ -0,0 +1,8 @@ +{ + "_comment": "Canonical-source hash pins for the composer phase extracts in this directory. 
The extracts hand-mirror sections of these files; scripts/check-plugins.ts fails CI when a pinned file changes, until the extracts are reviewed and the pin refreshed via `bun run sync:plugins`.", + "pins": { + "plugins/claude-code/skills/mymir/references/conventions.md": "e5aedc7f4fc602e17c4e93b1bd73528988ce99977f3589de57f7b4c0b4431c11", + "plugins/claude-code/skills/mymir/references/artifacts.md": "29f10467f051ef2182af0bcdd29927c75da73d3cfcc7d8ffa4bb5188c0fe43bd", + "plugins/claude-code/skills/mymir/references/lifecycle.md": "8db13246adf25c6a388ddd35f4868ff5ee7ba9b3ce4dd77cb85d1c0ae68d0646" + } +} diff --git a/plugins/codex/skills/composer/references/reviewer-rules.md b/plugins/codex/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..c32079df --- /dev/null +++ b/plugins/codex/skills/composer/references/reviewer-rules.md @@ -0,0 +1,132 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical mymir references for the review agent. +Mirrors: `skills/mymir/references/conventions.md` §1, +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. §2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. 
A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## lifecycle §2.2 — Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Mymir-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Mymir-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Mymir PR-status tracking. Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. + +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +--- + +## lifecycle §3 — Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Mymir's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `mymir_query type='edges'` on the changed task. Current relationships. 
+2. `mymir_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. + +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `done` and `cancelled`) + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. +- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. 
+- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. +- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". diff --git a/plugins/codex/skills/mymir/SKILL.md b/plugins/codex/skills/mymir/SKILL.md index 50a5e282..8c899183 100644 --- a/plugins/codex/skills/mymir/SKILL.md +++ b/plugins/codex/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. 
The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode), **`/mymir:composer `** (single-task mode), or **`/mymir:composer rework `** (round GitHub review feedback back through the fix loop). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). 
| | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review ``", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. 
When HOTL requests changes on a composer PR instead of merging, `/mymir:composer rework ` rounds that feedback back through the fix loop. Composer runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/codex/skills/mymir/references/artifacts.md b/plugins/codex/skills/mymir/references/artifacts.md index 191e13cf..ab3218e3 100644 --- a/plugins/codex/skills/mymir/references/artifacts.md +++ b/plugins/codex/skills/mymir/references/artifacts.md @@ -4,7 +4,7 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/codex/skills/mymir/references/conventions.md b/plugins/codex/skills/mymir/references/conventions.md index f7d6a5b0..8f99b746 100644 --- a/plugins/codex/skills/mymir/references/conventions.md +++ b/plugins/codex/skills/mymir/references/conventions.md @@ -6,7 +6,7 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug.
-> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/codex/skills/mymir/references/lifecycle.md b/plugins/codex/skills/mymir/references/lifecycle.md index f174462f..d02beb7d 100644 --- a/plugins/codex/skills/mymir/references/lifecycle.md +++ b/plugins/codex/skills/mymir/references/lifecycle.md @@ -4,7 +4,7 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/cursor/skills/composer/references/reviewer-rules.md b/plugins/cursor/skills/composer/references/reviewer-rules.md new file mode 100644 index 00000000..c32079df --- /dev/null +++ b/plugins/cursor/skills/composer/references/reviewer-rules.md @@ -0,0 +1,132 @@ +# Reviewer rules (composer Phase 4 extract) + +Slim extract of the canonical mymir references for the review agent. +Mirrors: `skills/mymir/references/conventions.md` §1, +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/artifacts.md` §1 (`executionRecord`, +`decisions`), §6. 
Headings carry their canonical file and section number +so citations like `lifecycle §2.2` resolve unambiguously. When editing a +mirrored section, edit BOTH files. + +The reviewer verifies the Completion Protocol was honored; it does not +execute it. §2.2 and §2.3 below are what the implementer was required to +do; §3 is what the orchestrator runs after your verdict, fed by your +downstream-impact list. + +--- + +## conventions §1 — The Iron Law of grounding + +``` +Never write what you cannot cite or do not know. +``` + +Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. For the reviewer it applies to the verdict: every finding cites a real file path and line, every AC evaluation cites the diff or the executionRecord. When uncertain, write less. A short, true verdict is more valuable than a rich, fabricated one. + +--- + +## lifecycle §2.2 — Populate the required fields + +`executionRecord`, `decisions`, `files`, `acceptanceCriteria`, plus `prUrl` when a PR was opened (backend upserts a `task_links` row with `kind='pull_request'` so the review subagent and detail UI can resolve the PR). The MCP server returns `_hints` if any are missing. + +For pure spec-review / docs / decision-only / Mymir-only refinement tasks that touched no repo files, `files=[]` is the correct positive answer to "what changed in the repo?", not the absence of an answer. + +## lifecycle §2.3 — Open a PR if the work changed code (what the implementer owed) + +If `files` is non-empty AND the work was a real code change (not research, not decision-only, not Mymir-only refinement), the implementer must have opened a PR: + +- PR body follows the repo's PR template when one exists (`.github/PULL_REQUEST_TEMPLATE.md` and variants), the canonical concise default otherwise. +- The `taskRef` appears in `[BRACKETS]` (e.g. `[MYMR-83]`) exactly once, for the ONE primary task the PR builds. Bracket form triggers Mymir PR-status tracking. 
Related tasks are referenced as plain links, no brackets. +- Summary maps from `executionRecord` (2 to 3 sentences); test plan maps from checked `acceptanceCriteria`; notes-for-reviewer maps from `decisions`. +- Sections are concise; empty optional sections beat fabricated content. + +A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. + +--- + +## lifecycle §3 — Propagate after every change (Iron Law) + +``` +A change that does not propagate did not happen. +``` + +The graph is Mymir's value. Skip once and it lies: ready tasks that aren't ready, blockers pointing at shipped work, every future session picking the wrong next step. + +After any status change or significant refinement: + +1. `mymir_query type='edges'` on the changed task. Current relationships. +2. `mymir_analyze type='downstream'`. Who depends on this task. +3. For each downstream task, evaluate: + - Do edge notes need updating to reflect new decisions? + - Are there NEW relationships revealed by this change? + - Are there STALE relationships that no longer hold? + - Do downstream descriptions need updating based on the decisions made? +4. Create, update, or remove edges as needed. + +The reviewer does not execute propagation. Your downstream-impact list names the edges that will need attention; the orchestrator (or the human) executes the rewires. + +--- + +## artifacts §1 — Task artifact quality + +### `executionRecord` (only on `done` and `cancelled`) + +- **Length:** 3 to 5 sentences. +- **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). +- **Include:** function names, file paths, endpoints, data formats. +- **Exclude:** debugging stories, false starts, filler. +- **For `cancelled`:** rationale (why abandoned), approaches tried, decisions learned. Same shape as a done record, just for non-shipping outcomes. 
+- **Draft tasks must NOT carry an `executionRecord`.** That field implies the task shipped. + +### `decisions` + +One-liner per decision. Format: **CHOICE + WHY**. + +``` +GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." +GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." + +BAD: "Used Drizzle" +BAD: "We picked Redis because it's good" +BAD: "Decided to do it that way" +``` + +Never invent. An implementer `decisions` entry that is not grounded in the diff, the plan, or the conversation is a finding. + +--- + +## artifacts §6 — Markdown formatting and tone + +Applies to everything you write into the verdict. + +### Structure + +- Bullet lists (`-`) for 3 or more items. Never run-on prose. +- Backticks for code references: file paths, function names, endpoints, variables, package names. +- Paragraph breaks between distinct topics. + +### Tone: never sound like AI + +**Do not use:** + +- Em dashes (the `—` character). Use periods, commas, parentheses, or colons. +- Hedging openers: "I think", "perhaps", "seems to", "might be", "arguably". +- Enthusiasm: "Great question", "Awesome", "Exciting", "Love this". +- Throat-clearing: "Let me dive into", "I hope this helps", "Here's the thing", "To be honest". +- Marketing words: "comprehensive", "robust", "powerful", "leverage", "utilize", "ensure", "facilitate", "seamless", "game-changer", "best-in-class". +- Adverb-heavy openers: "Importantly", "Crucially", "Notably", "Essentially", "Basically". +- Empty filler: "It's worth noting that", "It should be mentioned", "As a matter of fact". +- Performative summaries at the end: "I hope this helps!", "Let me know if you need anything else!" + +**Do:** + +- Subject, verb, object. +- Active voice. +- Concrete over abstract. "Adds 50ms p99" beats "improves performance". +- Specific over vague. "Stripe webhook handler" beats "payment integration". +- Cut adverbs. 
+- One idea per sentence. + +### Length + +Concision over padding. No filler, no repetition. The rule is "no fluff", not "no length". diff --git a/plugins/cursor/skills/mymir/SKILL.md b/plugins/cursor/skills/mymir/SKILL.md index 50a5e282..8c899183 100644 --- a/plugins/cursor/skills/mymir/SKILL.md +++ b/plugins/cursor/skills/mymir/SKILL.md @@ -141,7 +141,7 @@ You handle most Mymir interactions inline. The four agents are escalations for h | Decompose a project: large, multi-domain, or sensitive | Dispatch **`mymir:decompose`** for the gated 4-phase pipeline | | Split a single existing oversize task into children within an active project ("split this task", "decompose RZE-42", composer's oversize handler) | Dispatch **`mymir:decompose-task`** for the gated split + edge-rewiring + parent-cancel pipeline | | Add a new feature or capability cluster to an active project ("add a feature for X", "decompose this idea into tasks", "extend the project with Y") | Dispatch **`mymir:decompose-feature`** for the gated feature-addition pipeline | -| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode) or **`/mymir:composer `** (single-task mode). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). 
| +| Drive tasks end-to-end through research + plan + implement + review + propagate ("ship the backlog", "run the next task", "compose through my queue", "loop through mymir tasks", a named task ref to take all the way to a PR) | Suggest user invoke **`/mymir:composer`** (backlog mode), **`/mymir:composer `** (single-task mode), or **`/mymir:composer rework `** (round GitHub review feedback back through the fix loop). Composer is a slash-command skill that orchestrates four dispatched subagents per task in clean per-phase contexts; the user has to type the slash command for it to start; composer then runs continuously and stops on structural conditions (queue drained, failure budget, user stop). | | Review an `in_review` task or a PR by URL ("review MYMR-N", "review this PR", "review ``", "what does the review subagent think of MYMR-N") | Dispatch **`mymir:review`** for a five-lens structured verdict (`approve` / `request-changes` / `block`). The verdict is advisory; HOTL still owns the `in_review → done` transition on GitHub. | | Status, next task, mark done, plan a draft, refine, dispatch, create or delete task | Handle inline. **Do not** dispatch `mymir:manage` for these; they are day-to-day. | | Strategic review, rebalance the graph, audit dependencies, prune orphans, connect missing edges, audit blockers, consolidate categories or tags, graph-health check, "is this project on track?" | Dispatch **`mymir:manage`** for deep CTO mode | @@ -180,7 +180,7 @@ Lead with slim tools. - `mymir_analyze type='plannable'`. Drafts ready to plan. - Pick one on the critical path. **§ Plan a draft task**. -**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. 
It runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. +**For end-to-end automation across the queue:** suggest `/mymir:composer` (backlog mode). Composer picks the highest-value ready task each iteration, drives it through research + plan + implement + review + propagate via dispatched subagents in clean per-phase contexts, then loops until the queue is empty or the user stops. When HOTL requests changes on a composer PR instead of merging, `/mymir:composer rework ` rounds that feedback back through the fix loop. Composer runs continuously without per-task check-ins, gates only on genuine decisions (oversize tasks, proposed rewrites, open questions), runs a bounded review→fix loop per task, and stops structurally when the queue drains or the user says stop. Use this when the user wants the queue shipped without picking each task manually; use the inline picker above when the user wants per-task agency. ### Refine a task diff --git a/plugins/cursor/skills/mymir/references/artifacts.md b/plugins/cursor/skills/mymir/references/artifacts.md index 191e13cf..ab3218e3 100644 --- a/plugins/cursor/skills/mymir/references/artifacts.md +++ b/plugins/cursor/skills/mymir/references/artifacts.md @@ -4,7 +4,7 @@ Quality bar for everything an agent writes into Mymir: titles, descriptions, acc Agents read this file when about to create, refine, or audit an artifact. The Iron Law of grounding (`conventions.md` §1) applies at every step. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too.
+> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/cursor/skills/mymir/references/conventions.md b/plugins/cursor/skills/mymir/references/conventions.md index 2b0f7ba6..c9479790 100644 --- a/plugins/cursor/skills/mymir/references/conventions.md +++ b/plugins/cursor/skills/mymir/references/conventions.md @@ -6,7 +6,7 @@ Mymir runs across every kind of software and data project: web and SaaS apps, mo Every Mymir skill and agent must follow these rules. Drift between any rule file and any agent is a bug. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. +> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/plugins/cursor/skills/mymir/references/lifecycle.md b/plugins/cursor/skills/mymir/references/lifecycle.md index f174462f..d02beb7d 100644 --- a/plugins/cursor/skills/mymir/references/lifecycle.md +++ b/plugins/cursor/skills/mymir/references/lifecycle.md @@ -4,7 +4,7 @@ How tasks move through state, what each state means, the Completion Protocol (wi Agents read this file before any status transition, before marking a task done or cancelled, and after every status change to propagate. -> Sections of this file are mirrored by the composer phase extracts in `skills/composer/references/`; when you edit a mirrored section, update the extract too. 
+> Sections of this file are mirrored by the composer phase extracts in the claude-code plugin (`plugins/claude-code/skills/composer/references/`); when you edit a mirrored section, update those extracts and bump the pin in their `sources.json`. --- diff --git a/scripts/check-plugins.ts b/scripts/check-plugins.ts index 8152d198..79c95de5 100644 --- a/scripts/check-plugins.ts +++ b/scripts/check-plugins.ts @@ -1,6 +1,12 @@ -import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs"; +import { + readFileSync, + writeFileSync, + existsSync, + mkdirSync, + readdirSync, +} from "node:fs"; import { createHash } from "node:crypto"; -import { dirname } from "node:path"; +import { dirname, join } from "node:path"; interface SharedGroup { name: string; @@ -160,8 +166,32 @@ const shared: SharedGroup[] = [ "plugins/antigravity/skills/review/SKILL.md", ], }, + { + name: "skills/composer/references/reviewer-rules.md", + canonical: "plugins/claude-code/skills/composer/references/reviewer-rules.md", + copies: [ + "plugins/codex/skills/composer/references/reviewer-rules.md", + "plugins/cursor/skills/composer/references/reviewer-rules.md", + "plugins/antigravity/skills/composer/references/reviewer-rules.md", + ], + }, +]; + +const pluginRoots = [ + "plugins/claude-code", + "plugins/codex", + "plugins/cursor", + "plugins/antigravity", ]; +const extractPinsPath = + "plugins/claude-code/skills/composer/references/sources.json"; + +interface ExtractPins { + _comment: string; + pins: Record<string, string>; +} + const fieldSyncs: FieldSync[] = [ { name: "description", @@ -274,6 +304,89 @@ function setNested( parent[last] = value; } +/** + * Recursively lists markdown files under a directory. + * @param root - Directory to walk. + * @returns Repo-relative paths of every `.md` file found.
+ */ +function listMarkdownFiles(root: string): string[] { + return (readdirSync(root, { recursive: true }) as string[]) + .filter((p) => p.endsWith(".md")) + .map((p) => join(root, p)); +} + +/** + * Validates that every `@path` include line in a plugin's markdown files + * resolves to an existing file inside that plugin. Includes are + * plugin-root-relative; a dangling include silently strips an agent's + * loaded rules at runtime, so it must fail the check. + * @param root - Plugin root directory (e.g. `plugins/codex`). + * @returns Number of dangling includes found (also logged to stderr). + */ +function checkIncludeTargets(root: string): number { + let dangling = 0; + for (const file of listMarkdownFiles(root)) { + const lines = readFileSync(file, "utf8").split("\n"); + for (const line of lines) { + const match = line.match(/^@(\S+)$/); + if (!match) continue; + const target = join(root, match[1]); + if (!existsSync(target)) { + console.error(`[dangling include] ${file}: @${match[1]} (missing)`); + dangling++; + } + } + } + return dangling; +} + +/** + * Verifies the composer extracts' canonical-source hash pins. The extracts + * hand-mirror sections of the canonical mymir references; the pin file + * records the canonical files' hashes the extracts were last reviewed + * against. Any canonical edit fails the check until the extracts are + * reviewed and the pins refreshed (`--fix` refreshes them, loudly). + * @param fixMode - When true, refresh stale pins after warning. + * @returns Object with failure and change counts. 
+ */ +function checkExtractPins(fixMode: boolean): { + failures: number; + changes: number; +} { + if (!existsSync(extractPinsPath)) { + console.error(`[missing pins] ${extractPinsPath}`); + return { failures: 1, changes: 0 }; + } + const pinFile = JSON.parse( + readFileSync(extractPinsPath, "utf8"), + ) as ExtractPins; + let failures = 0; + let changes = 0; + for (const [path, pinned] of Object.entries(pinFile.pins)) { + const actual = hashFile(path); + if (actual === pinned) { + console.log(`[ok] extract pin ${path}`); + continue; + } + if (fixMode) { + pinFile.pins[path] = actual; + console.log( + `[extracts] ${path} changed — pin refreshed. REVIEW the mirrored sections in plugins/claude-code/skills/composer/references/ before committing.`, + ); + changes++; + } else { + console.error( + `[extract drift] ${path} changed since the composer extracts were last reviewed (pin ${pinned.slice(0, 8)} vs ${actual.slice(0, 8)}). Review the mirrored sections in plugins/claude-code/skills/composer/references/, update them if needed, then run \`bun run sync:plugins\` to refresh the pin.`, + ); + failures++; + } + } + if (changes > 0) { + writeFileSync(extractPinsPath, JSON.stringify(pinFile, null, 2) + "\n"); + } + return { failures, changes }; +} + const fix = process.argv.includes("--fix"); let failures = 0; @@ -361,13 +474,21 @@ for (const sync of fieldSyncs) { } } +for (const root of pluginRoots) { + failures += checkIncludeTargets(root); +} + +const pinResult = checkExtractPins(fix); +failures += pinResult.failures; +changes += pinResult.changes; + if (fix) { console.log( changes > 0 ? `\nSynced ${changes} file(s)/field(s).` : `\nNothing to sync.`, ); - process.exit(0); + process.exit(failures > 0 ? 
1 : 0); } if (failures > 0) { From 4bf9001868f9308de69e5d6c7c209ec339ffc110 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 19:28:16 +0200 Subject: [PATCH 38/45] fix: format sync script and remove toctou race in pin check --- scripts/check-plugins.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/check-plugins.ts b/scripts/check-plugins.ts index 79c95de5..6b88a12c 100644 --- a/scripts/check-plugins.ts +++ b/scripts/check-plugins.ts @@ -168,7 +168,8 @@ const shared: SharedGroup[] = [ }, { name: "skills/composer/references/reviewer-rules.md", - canonical: "plugins/claude-code/skills/composer/references/reviewer-rules.md", + canonical: + "plugins/claude-code/skills/composer/references/reviewer-rules.md", copies: [ "plugins/codex/skills/composer/references/reviewer-rules.md", "plugins/cursor/skills/composer/references/reviewer-rules.md", @@ -353,13 +354,13 @@ function checkExtractPins(fixMode: boolean): { failures: number; changes: number; } { - if (!existsSync(extractPinsPath)) { - console.error(`[missing pins] ${extractPinsPath}`); + let pinFile: ExtractPins; + try { + pinFile = JSON.parse(readFileSync(extractPinsPath, "utf8")) as ExtractPins; + } catch { + console.error(`[missing pins] ${extractPinsPath} (absent or unreadable)`); return { failures: 1, changes: 0 }; } - const pinFile = JSON.parse( - readFileSync(extractPinsPath, "utf8"), - ) as ExtractPins; let failures = 0; let changes = 0; for (const [path, pinned] of Object.entries(pinFile.pins)) { From 74eb69b381b80bdc58672824c2d70085d41e7202 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 20:10:16 +0200 Subject: [PATCH 39/45] fix: address fresh-eyes composer review findings --- .../claude-code/agents/composer-implementer.md | 8 ++++---- plugins/claude-code/agents/composer-planner.md | 5 ++--- plugins/claude-code/agents/composer-researcher.md | 12 +++++++----- plugins/claude-code/agents/review.md | 2 +- 
plugins/claude-code/skills/composer/SKILL.md | 13 +++++++------ .../composer/references/researcher-rules.md | 2 +- .../skills/composer/references/sources.json | 2 +- .../skills/composer/tests/scenarios.md | 15 +++++++++++++-- .../skills/mymir/references/artifacts.md | 2 +- 9 files changed, 37 insertions(+), 24 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index b46b0ed1..ac926287 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -109,11 +109,11 @@ b. Create a feature branch from the project's default branch. DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef -q '.defaultBranchRef.name') # Fallback when gh is unavailable: # DEFAULT_BRANCH=$(git remote show origin | sed -n 's/.*HEAD branch: //p') - git checkout "$DEFAULT_BRANCH" && git pull --ff-only + git fetch origin "$DEFAULT_BRANCH" git fetch origin "+refs/heads/:refs/remotes/origin/" 2>/dev/null || true ``` - Never hardcode `main`; projects differ. Shell state does not persist between your Bash tool calls: every later block that uses `$DEFAULT_BRANCH` re-derives it on its first line — keep those lines when you run the blocks separately. + Never hardcode `main`; projects differ. Never check out the default branch itself: under worktree isolation it is usually checked out in the orchestrator's tree and `git checkout` refuses (one checkout per branch across worktrees); branching from `origin/$DEFAULT_BRANCH` gives the same fresh base in both modes. Shell state does not persist between your Bash tool calls: every later block that uses `$DEFAULT_BRANCH` re-derives it on its first line — keep those lines when you run the blocks separately. **If the task branch already exists** (locally or on `origin`): do not create a new one. 
Verify it is yours first against the remote ref (the branch may exist only on `origin`; the bare local name will not resolve there): @@ -125,7 +125,7 @@ b. Create a feature branch from the project's default branch. The commits or the PR must reference this taskRef (the `[]` bracket form, or the taskRef in commit subjects). Yours: check it out (`git checkout ` when a local ref exists, else `git checkout -b origin/`) and continue from where the prior attempt stopped (retries reuse the branch). Foreign (a different task or author squatting the deterministic name): fail loudly naming the conflict — `STATUS: BLOCKED — branch collision: carries `. Suffixes stay forbidden; never mint `-2`. - **Otherwise**: `git checkout -b `. + **Otherwise**: `git checkout -b "origin/$DEFAULT_BRANCH"`. **Never** append an `attempt-N` suffix and **never** nest the taskRef as its own path segment (`composer/RZE-17/attempt-1` is wrong; this is an old pattern that no longer applies). Retries reuse the same branch and append commits; git history tracks attempts, the branch name does not. One branch per task; do not stack tasks on one branch unless the user has explicitly arranged it. @@ -180,7 +180,7 @@ mymir_task action='update' taskId='' executionRecord='' decisions=['', ...] files=['', ...] - acceptanceCriteria=[{id: '', checked: true|false}, ...] + acceptanceCriteria=[{id: '', text: '', checked: true|false}, ...] prUrl='' ``` diff --git a/plugins/claude-code/agents/composer-planner.md b/plugins/claude-code/agents/composer-planner.md index 082cfc9b..5975acdc 100644 --- a/plugins/claude-code/agents/composer-planner.md +++ b/plugins/claude-code/agents/composer-planner.md @@ -5,9 +5,8 @@ description: > composer orchestrator after the researcher returns. Takes the research brief plus the target task's planning context, writes the unabridged implementationPlan to Mymir, and transitions the task draft → planned in - the same update. 
Applies refinements the researcher proposed - (acceptance criteria rewrites, description tightening, tag adjustments) - via append-only updates. Returns a one-sentence confirmation. Does not + the same update. Fills refinement gaps the researcher missed via + append-only updates. Returns a one-sentence confirmation. Does not edit code, run tests, or open PRs. Invoked automatically by the composer skill; safe to call directly when the user asks "plan from the research brief" outside the composer loop. diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index b9dee235..d4310bda 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -8,8 +8,10 @@ description: > the implementer will touch, surfaces the project's house conventions (commit format, test/lint/typecheck commands, PR template), and reasons about security, performance, and reliability standards the work must - meet. Returns one research brief; does not write to Mymir, the repo, or - any external system. Invoked automatically by the composer skill; safe + meet. Applies refinements (description, acceptance criteria, tags, + category, priority, estimate, decisions) directly to the target task, + never status, and returns one research brief; writes nothing to the + repo or any external system. Invoked automatically by the composer skill; safe to call directly when the user asks "research task " or "investigate before planning" outside the composer loop. model: sonnet @@ -114,13 +116,13 @@ Run these in the order given; do not skip. Steps 2–5 can fan out in parallel w 7. **Apply refinements.** Fold your findings back into the target task with one or more `mymir_task action='update'` calls. The fields you may touch are the refinement fields in *Allowed tools*; each must be backed by a citation you would put in the brief. 
Per-field rules: - - **`description`**: when the existing description fails the rubric in `references/artifacts.md` §1, rewrite it. Cite the codebase reads that justify the rewrite. If the rewrite preserves scope and intent (sharper wording, concrete file paths, missing context filled in), apply directly. If the rewrite would change what the task IS (different scope, different deliverable), do not apply; emit the proposal in `## Proposed rewrites` per *Substantive rewrites: propose, do not apply* above. + - **`description`**: when the existing description fails the rubric in artifacts §1, rewrite it. Cite the codebase reads that justify the rewrite. If the rewrite preserves scope and intent (sharper wording, concrete file paths, missing context filled in), apply directly. If the rewrite would change what the task IS (different scope, different deliverable), do not apply; emit the proposal in `## Proposed rewrites` per *Substantive rewrites: propose, do not apply* above. - **`acceptanceCriteria`**: apply the binary rewrites/additions from step 6 directly (same intent, sharper wording). If your investigation shows the AC composition itself needs to change (different criteria, different coverage scope), do not apply; emit the proposal in `## Proposed rewrites`. - - **`tags`**: when the three-dimension taxonomy in `references/artifacts.md` §2 is incomplete, add the missing dimensions. Run `mymir_query type='meta'` first to reuse existing vocabulary. + - **`tags`**: when the three-dimension taxonomy in artifacts §2 is incomplete, add the missing dimensions. Run `mymir_query type='meta'` first to reuse existing vocabulary. - **`category`**: set to the closest match from `mymir_query type='meta'`. Never coin a new category, and never use process phases (`requirements`, `planning`, `review`), work types, or priorities as a category — those shapes are forbidden; categories are subsystems/product areas only. 
- **`priority`**: adjust when your investigation surfaces evidence the current value is wrong (e.g., a security boundary the task crosses argues for `core` or `urgent`). - **`estimate`**: adjust up or down within the Fibonacci scale (`1, 2, 3, 5, 8, 13`) when scope drift is evident. The field is bounded; never propose a value above `13`. If your scope analysis shows the work exceeds what `13` represents, do not invent a higher estimate; raise `oversize-task` in *Flags* so the orchestrator routes to `mymir:decompose-task` before planning. Do not write to `decisions` just to record the bump; the field's prior/new value is in the audit log. - - **`decisions`**: append a one-liner only when refinement work produced a real CHOICE + WHY (see `references/artifacts.md` §1 for shape and examples). Real cases: picking one library version or pattern over an alternative when the codebase or docs argue for it; choosing to reuse an existing module rather than introducing a new one. Findings, measurements, and pinned-version facts are *not* decisions; those belong in the brief's *Security/performance/...* and *External dependencies* sections, not in `decisions`. Better an empty `decisions` list than fabricated entries. + - **`decisions`**: append a one-liner only when refinement work produced a real CHOICE + WHY (see artifacts §1 for shape and examples). Real cases: picking one library version or pattern over an alternative when the codebase or docs argue for it; choosing to reuse an existing module rather than introducing a new one. Findings, measurements, and pinned-version facts are *not* decisions; those belong in the brief's *Security/performance/...* and *External dependencies* sections, not in `decisions`. Better an empty `decisions` list than fabricated entries. Every refinement appends; never pass `overwriteArrays=true`. When in doubt, leave the field alone and surface the call in `open_questions`. Speculation in a `description` rewrite is worse than a thin description. 
diff --git a/plugins/claude-code/agents/review.md b/plugins/claude-code/agents/review.md index 4b8ac071..8a5e984e 100644 --- a/plugins/claude-code/agents/review.md +++ b/plugins/claude-code/agents/review.md @@ -89,7 +89,7 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. 
If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index 33ba89fe..fb68ddff 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -23,7 +23,7 @@ Composer is glue. The heavy lifting (task selection, refinement, the Completion - **`/mymir:composer rework `**: rework mode. HOTL requested changes on GitHub instead of merging; composer rounds that feedback back through the fix loop. - **`/mymir:composer --pipelined`**: backlog mode with research-ahead. While task A is in review/fix, the researcher for the next task B runs in the background. Lookahead is hard-capped at 1. Backlog mode only — the flag is ignored in single-task and rework modes. -No argument means backlog mode; `rework` plus an argument means rework mode; anything else is single-task. +No argument means backlog mode; `--pipelined` alone is still backlog mode (with research-ahead); `rework` plus an argument means rework mode; anything else is single-task. ## Mymir operating context @@ -45,7 +45,7 @@ Each is a registered plugin agent dispatched via the Task tool by `subagent_type | 3. Implement | `mymir:composer-implementer` | `status='in_progress'` (claim), `status='in_review'` (+ full Completion Protocol payload); in fix mode rotates `in_review → in_progress → in_review` | PR URL + one-line summary + `STATUS` line | | 4. Review | `mymir:review` | Nothing (read-only over Mymir) | Structured verdict + `STATUS` line | -The task row is the single source of truth. 
The researcher refines it before planning; the planner saves the plan to it; the implementer reads everything (refined description, ACs, plan, upstream decisions) from `mymir_context depth='agent'`; the reviewer reads `mymir_context depth='review'`. Dispatch payloads stay minimal (see *Dispatch hygiene*). +The task row is the single source of truth. The researcher refines it before planning; the planner saves the plan to it; the implementer reads everything (refined description, ACs, plan, upstream decisions) from `mymir_context depth='agent'`; the reviewer runs its own two-phase fetch (`depth='working'`, then `depth='review'`). Dispatch payloads stay minimal (see *Dispatch hygiene*). ## Status vocabulary @@ -54,7 +54,7 @@ Every subagent return ends with `STATUS: `. Branch | STATUS | Meaning | Orchestrator reaction | | --- | --- | --- | | `DONE` | Phase output complete | Advance to the next phase | -| `DONE_WITH_CONCERNS` | Complete, but the agent flagged doubts | Quote the concerns in the iteration log, then advance | +| `DONE_WITH_CONCERNS` | Complete, but the agent flagged doubts | Quote the concerns in the run log, then advance | | `NEEDS_DECISION` | A user decision is required | Gate via `AskUserQuestion`; act on the answer | | `BLOCKED` | Phase cannot complete | *Failure handling* | @@ -166,7 +166,7 @@ digraph composer_iteration { The gate re-runs after every fix rotation's implementer DONE. -6. **Review and the fix loop.** Dispatch `mymir:review` with: `Target task: . PR URL: . Mode: composer-phase-4. Fetch the bundle via mymir_context depth='review'.` On `STATUS: DONE`, branch on the verdict payload: +6. **Review and the fix loop.** Dispatch `mymir:review` with: `Target task: . PR URL: . Mode: composer-phase-4.` — nothing more; the reviewer's two-phase context fetch is its own contract, and instructing a `depth='review'` fetch up front would defeat it. On `STATUS: DONE`, branch on the verdict payload: - **`approve`**: go to step 7. 
- **`request-changes`**, fewer than 2 fix rotations used this task: dispatch the implementer in fix mode — `Target task: . Fix mode. PR: . Address exactly these review findings, re-run verification, re-mark in_review:` followed by the verdict's blocking findings verbatim. On the implementer's `DONE`, re-run the CI gate (step 5), then re-dispatch the reviewer (same dispatch shape). Each fix dispatch + re-review is one rotation. - **`request-changes`** with 2 rotations used, or **`block`**: stop fixing. Escalate every verdict from this task to HOTL and go to step 7. `block` is never auto-fixed; review.md calibrates it as "one rotation will not land this". @@ -215,6 +215,7 @@ One timestamped line per event, `key=value` pairs; multi-line payloads (blocking | `ESCALATE` | rotations exhausted or a `block` verdict goes to HOTL | | `SURFACED` | the final verdict is quoted to the user | | `PROPAGATED` | step 7 propagation completes (`edges= unblocked=`) | +| `BRIEF` | a `--pipelined` prefetch brief lands (`task= baselinedAt=`; the brief verbatim as continuations) | | `FAIL` | a phase returns BLOCKED (failure summary as continuation) | | `TASK_END` | the iteration ends (`outcome=in_review\|planned\|stuck\|skipped rotations=`) | | `RESUME` | recovery appends this after reading the log | @@ -246,7 +247,7 @@ Pull-based: the backend has no webhooks, and `task_links` is the only PR record. 1. **Resolve the pair.** Given a taskRef, read `task.links` filtered to `kind='pull_request'`; given a PR URL, resolve the task from the `[]` bracket in the PR title/body (verify the link row agrees). When several PR links exist, prefer the newest open PR — never trust oldest-link-wins. Every downstream dispatch carries the explicit PR URL. 2. **Reviewer-led intake.** Dispatch `mymir:review` with: `Target task: . PR URL: . Mode: rework-intake.` The intake re-verifies the human feedback against current HEAD and returns a standard verdict. 3. 
**Branch on the intake verdict.** - - `request-changes`: the blocking findings are the human's items with fresh file:line citations. Run *Review and the fix loop* verbatim from the fix-dispatch step, with a **fresh rotation budget of 2 for this rework invocation** (it is a new review cycle; prior runs' rotations do not count). The CI gate (step 5) applies to each rotation as usual. + - `request-changes`: the blocking findings are the human's items with fresh file:line citations. Run *Review and the fix loop* from the fix-dispatch step, with two changes: prefix each fix dispatch with `Rework.` (the implementer accepts an `in_progress` entry only when the dispatch says rework — HOTL may flip `in_review → in_progress` to signal rework), and use a **fresh rotation budget of 2 for this rework invocation** (it is a new review cycle; prior runs' rotations do not count). The CI gate (step 5) applies to each rotation as usual. - approve-shaped "nothing to rework": zero unresolved feedback. Report it and stop; the iteration is complete. - `BLOCKED` (PR merged/closed, task `done`/`cancelled`): report and stop; there is nothing legal to do. 4. **Finish like any iteration.** Surface the final verdict, propagate (step 7), `TASK_END`. The run log records the whole run with `RUN_START mode=rework`. @@ -260,7 +261,7 @@ Only under `--pipelined`, only in backlog mode, lookahead 1. The win is latency - **Trigger:** dispatch researcher(B) in the background only after implementer(A) returns DONE — overlap covers A's CI gate, review, and fix rotations only. Never manage background work while A is still implementing. - **Pick B excluding A.** B must be ready independently of A by construction — `in_review` unblocks nothing, so the ready set already excludes A's dependents. - **Isolation:** researcher(B) is dispatched with worktree isolation and `run_in_background`; the orchestrator's tree and A's review baseline never move. 
-- **Brief custody:** when researcher(B) returns, append the brief verbatim to the run log with a baseline marker line: `briefFor: , baselinedAt: in_progress, `. The transcript copy is working memory; the log copy survives compaction. +- **Brief custody:** when researcher(B) returns, append a `BRIEF` event to the run log (`task= baselinedAt=`) with the brief verbatim as `> ` continuation lines. The transcript copy is working memory; the log copy survives compaction. The prefetch is not a `PICK`: B's `PICK` line is written when B's own iteration starts, so recovery's last-`PICK`-without-`TASK_END` rule still finds A. - **Gates queue.** A `NEEDS_DECISION` from researcher(B) queues until A's iteration boundary; never interrupt A's review/fix cycle to gate on B. - **Propagation(A) never runs while researcher(B) is in flight.** Wait for the researcher's return (or stop it) before touching edges. - **One motion at a time:** at most one task is ever in the `planned → in_progress → in_review` motion. B is never planned, claimed, or implemented early. diff --git a/plugins/claude-code/skills/composer/references/researcher-rules.md b/plugins/claude-code/skills/composer/references/researcher-rules.md index f2e1e1c0..73e44b7d 100644 --- a/plugins/claude-code/skills/composer/references/researcher-rules.md +++ b/plugins/claude-code/skills/composer/references/researcher-rules.md @@ -191,7 +191,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## artifacts §2 — Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='meta'` before coining new ones. 
| Dimension | Count | Vocabulary | |---|---|---| diff --git a/plugins/claude-code/skills/composer/references/sources.json b/plugins/claude-code/skills/composer/references/sources.json index 432b866a..c7636104 100644 --- a/plugins/claude-code/skills/composer/references/sources.json +++ b/plugins/claude-code/skills/composer/references/sources.json @@ -2,7 +2,7 @@ "_comment": "Canonical-source hash pins for the composer phase extracts in this directory. The extracts hand-mirror sections of these files; scripts/check-plugins.ts fails CI when a pinned file changes, until the extracts are reviewed and the pin refreshed via `bun run sync:plugins`.", "pins": { "plugins/claude-code/skills/mymir/references/conventions.md": "e5aedc7f4fc602e17c4e93b1bd73528988ce99977f3589de57f7b4c0b4431c11", - "plugins/claude-code/skills/mymir/references/artifacts.md": "29f10467f051ef2182af0bcdd29927c75da73d3cfcc7d8ffa4bb5188c0fe43bd", + "plugins/claude-code/skills/mymir/references/artifacts.md": "0d56779937d0beaa30a502677d593d416a097b64e36520fb7979b887db39cdb9", "plugins/claude-code/skills/mymir/references/lifecycle.md": "8db13246adf25c6a388ddd35f4868ff5ee7ba9b3ce4dd77cb85d1c0ae68d0646" } } diff --git a/plugins/claude-code/skills/composer/tests/scenarios.md b/plugins/claude-code/skills/composer/tests/scenarios.md index 8644c235..078de3a2 100644 --- a/plugins/claude-code/skills/composer/tests/scenarios.md +++ b/plugins/claude-code/skills/composer/tests/scenarios.md @@ -16,7 +16,7 @@ one-line justification citing the section you are following. ``` `` defaults to `skills/composer/SKILL.md` with role "the composer -orchestrator". Scenarios 10 and 11 name a different agent file. +orchestrator". Scenarios 10, 11, and 20 name a different agent file. ## Scenarios @@ -103,7 +103,7 @@ Expected: derives that rotation 1 of 2 is already consumed (the FIX line), appen FAIL: resets rotations to 0, re-runs research or planning, or starts a fresh implementation. ### 17. 
Pipelined invalidation, file overlap (row 4) -Scenario: "`/mymir:composer --pipelined`, backlog mode. Task A (ZIN-4) just finished propagation; its PR touched `lib/auth/session.ts`. The prefetched brief for B (ZIN-6, marked `baselinedAt: ZIN-4 in_progress`) lists `lib/auth/session.ts` under Files to touch. No new depends_on edges; B's description unchanged." +Scenario: "`/mymir:composer --pipelined`, backlog mode. Task A (ZIN-4) just finished propagation; its PR touched `lib/auth/session.ts`. The prefetched brief for B (ZIN-6, logged as `BRIEF task=ZIN-6 baselinedAt=ZIN-4`) lists `lib/auth/session.ts` under Files to touch. No new depends_on edges; B's description unchanged." Expected: invalidation row 4 fires — re-dispatch the researcher on ZIN-6 with the ZIN-4 PR pointer in the open-questions dispatch slot; the stale brief never reaches the planner. FAIL: proceeds to plan B with the stale brief, re-picks (rows 1/5 did not fire), or counts the invalidation as a failed attempt. @@ -111,3 +111,14 @@ FAIL: proceeds to plan B with the stale brief, re-picks (rows 1/5 did not fire), Scenario: "ZIN-14: the planner returned `STATUS: NEEDS_DECISION — the brief leaves the storage backend choice unresolved; the plan cannot proceed without it`." Expected: gates via `AskUserQuestion`, then re-dispatches the PLANNER (the raising agent) with the answer; no implementer dispatch; not counted as a failed attempt. FAIL: routes to failure handling, re-dispatches the researcher instead of the planner, or proceeds to implement. + +### 19. Rework fix dispatch carries the rework marker +Scenario: "Rework mode on ZIN-16. HOTL flipped the task `in_review → in_progress`; intake returned `request-changes` with one finding re-anchored to current HEAD. You are about to dispatch the implementer." +Expected: fix-mode dispatch prefixed with `Rework.`, carrying the PR URL and the finding verbatim. 
+FAIL: a fix dispatch without the rework marker, a fresh (non-fix-mode) implementer dispatch, or refusing because the entry status is `in_progress`. + +### 20. Worktree branch creation +Agent file: `agents/composer-implementer.md`; role "the composer implementer". +Scenario: "You run worktree-isolated; the orchestrator's tree has the default branch `main` checked out. Pre-flight passed and the claim is written. The task branch does not exist locally or on origin. Reply with the exact branch-creation commands." +Expected: derives `$DEFAULT_BRANCH`, fetches it, creates the branch with `git checkout -b <branch> "origin/$DEFAULT_BRANCH"`; never checks out the default branch itself. +FAIL: runs `git checkout "$DEFAULT_BRANCH"` (refused in a worktree while it is checked out elsewhere) or hardcodes `main`. diff --git a/plugins/claude-code/skills/mymir/references/artifacts.md b/plugins/claude-code/skills/mymir/references/artifacts.md index ab3218e3..657781f4 100644 --- a/plugins/claude-code/skills/mymir/references/artifacts.md +++ b/plugins/claude-code/skills/mymir/references/artifacts.md @@ -179,7 +179,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='meta'` before coining new ones.
| Dimension | Count | Vocabulary | |---|---|---| From cf3fcb9a665f3e11003f1a095c6f1272fe2bd0a2 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 20:10:16 +0200 Subject: [PATCH 40/45] chore: sync platform plugin mirrors --- plugins/antigravity/skills/mymir/references/artifacts.md | 2 +- plugins/antigravity/skills/review/SKILL.md | 2 +- plugins/codex/skills/mymir/references/artifacts.md | 2 +- plugins/codex/skills/review/SKILL.md | 2 +- plugins/cursor/skills/mymir/references/artifacts.md | 2 +- plugins/cursor/skills/review/SKILL.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/antigravity/skills/mymir/references/artifacts.md b/plugins/antigravity/skills/mymir/references/artifacts.md index ab3218e3..657781f4 100644 --- a/plugins/antigravity/skills/mymir/references/artifacts.md +++ b/plugins/antigravity/skills/mymir/references/artifacts.md @@ -179,7 +179,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='meta'` before coining new ones. | Dimension | Count | Vocabulary | |---|---|---| diff --git a/plugins/antigravity/skills/review/SKILL.md b/plugins/antigravity/skills/review/SKILL.md index 782db0c6..e213d7ca 100644 --- a/plugins/antigravity/skills/review/SKILL.md +++ b/plugins/antigravity/skills/review/SKILL.md @@ -88,7 +88,7 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. 
Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. 
diff --git a/plugins/codex/skills/mymir/references/artifacts.md b/plugins/codex/skills/mymir/references/artifacts.md index ab3218e3..657781f4 100644 --- a/plugins/codex/skills/mymir/references/artifacts.md +++ b/plugins/codex/skills/mymir/references/artifacts.md @@ -179,7 +179,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='meta'` before coining new ones. | Dimension | Count | Vocabulary | |---|---|---| diff --git a/plugins/codex/skills/review/SKILL.md b/plugins/codex/skills/review/SKILL.md index 782db0c6..e213d7ca 100644 --- a/plugins/codex/skills/review/SKILL.md +++ b/plugins/codex/skills/review/SKILL.md @@ -88,7 +88,7 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. 
Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. diff --git a/plugins/cursor/skills/mymir/references/artifacts.md b/plugins/cursor/skills/mymir/references/artifacts.md index ab3218e3..657781f4 100644 --- a/plugins/cursor/skills/mymir/references/artifacts.md +++ b/plugins/cursor/skills/mymir/references/artifacts.md @@ -179,7 +179,7 @@ Never invent. If a decision is not grounded in conversation, code, or the artifa ## 2. Tag dimensions and first-class fields -Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='overview'` before coining new ones. +Every task, in every status, must carry tags across the three tag dimensions below. Reuse existing tags from `mymir_query type='meta'` before coining new ones. 
| Dimension | Count | Vocabulary | |---|---|---| diff --git a/plugins/cursor/skills/review/SKILL.md b/plugins/cursor/skills/review/SKILL.md index 782db0c6..e213d7ca 100644 --- a/plugins/cursor/skills/review/SKILL.md +++ b/plugins/cursor/skills/review/SKILL.md @@ -88,7 +88,7 @@ You own zero transitions. The implementer wrote `in_progress → in_review` with a. `mymir_context depth='working' taskId=''`. Returns description, acceptanceCriteria, decisions, 1-hop connected tasks (the edges section), and the PR handle from `task.links` filtered to `kind='pull_request'`. Mechanically excludes `executionRecord`, `implementationPlan` body, and `files`; steps 2 and 3 run against the diff with that exclusion in place, so the lens findings are formed from the code rather than from the implementer's narrative. The full review bundle (executionRecord, plan body, files, plan-vs-files drift, downstream) is fetched in step 4. -b. Confirm `status='in_review'`. Any other state stops the run. If the bundle reports a missing `prUrl` on a task whose `files` is non-empty, flag it: a code-changing `in_review` task without a PR is a Completion Protocol violation, not a review problem; surface the violation and stop. +b. Confirm `status='in_review'`. Any other state stops the run. If the bundle carries no PR handle (`task.links` has no `pull_request` entry) and the dispatch supplied no PR URL, stop: there is no diff to review. Either the task legitimately shipped without a PR (lifecycle §2.4 task types) or the Completion Protocol was violated on a code-changing task; the `working` bundle excludes `files`, so do not guess which — report the missing handle and return `STATUS: BLOCKED — PR handle missing`. When the dispatch supplies a PR URL but `task.links` lacks the row, proceed with the dispatch URL and flag the missing link as a Completion Protocol process note in the verdict. c. Resolve the PR. `gh pr view --json url,title,state,mergeable,statusCheckRollup,reviewDecision`. 
Note the CI state, the merge state, any failing checks. If checks are red, that is a `block`-class signal on its own; you can still produce the lens analysis, but the verdict cannot be `approve` while CI is red. Pending or unresolved checks cap the verdict at `request-changes`: when the dispatch says `CI: unresolved after ` (or you observe still-pending checks yourself), an otherwise-clean review returns `request-changes` with unresolved CI as the sole blocking finding. From 83d4386811b0bef9fe74546bab15744cd55033fc Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 12 Jun 2026 21:06:12 +0200 Subject: [PATCH 41/45] chore: bump plugin version to 1.9.0 --- lib/mcp/create-server.ts | 2 +- plugins/antigravity/plugin.json | 2 +- plugins/claude-code/.claude-plugin/plugin.json | 2 +- plugins/codex/.codex-plugin/plugin.json | 2 +- plugins/cursor/.cursor-plugin/plugin.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/mcp/create-server.ts b/lib/mcp/create-server.ts index 5be19387..47b65912 100644 --- a/lib/mcp/create-server.ts +++ b/lib/mcp/create-server.ts @@ -683,7 +683,7 @@ export function registerAllTools(server: McpServer, ctx: AuthContext): void { */ export function createMcpServer(ctx: AuthContext): McpServer { const server = new McpServer( - { name: "mymir", version: "1.8.0" }, + { name: "mymir", version: "1.9.0" }, { instructions: INSTRUCTIONS }, ); registerAllTools(server, ctx); diff --git a/plugins/antigravity/plugin.json b/plugins/antigravity/plugin.json index 35893945..bcda3cfa 100644 --- a/plugins/antigravity/plugin.json +++ b/plugins/antigravity/plugin.json @@ -1,5 +1,5 @@ { "name": "mymir", - "version": "1.8.0", + "version": "1.9.0", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions." 
} diff --git a/plugins/claude-code/.claude-plugin/plugin.json b/plugins/claude-code/.claude-plugin/plugin.json index 17922fff..757e55f0 100644 --- a/plugins/claude-code/.claude-plugin/plugin.json +++ b/plugins/claude-code/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "mymir", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions.", - "version": "1.8.0", + "version": "1.9.0", "author": { "name": "Mymir" }, diff --git a/plugins/codex/.codex-plugin/plugin.json b/plugins/codex/.codex-plugin/plugin.json index 03c6e34e..bfe8945a 100644 --- a/plugins/codex/.codex-plugin/plugin.json +++ b/plugins/codex/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "mymir", - "version": "1.8.0", + "version": "1.9.0", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions.", "author": { "name": "Mymir", diff --git a/plugins/cursor/.cursor-plugin/plugin.json b/plugins/cursor/.cursor-plugin/plugin.json index 21cc8634..6c22414f 100644 --- a/plugins/cursor/.cursor-plugin/plugin.json +++ b/plugins/cursor/.cursor-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "mymir", - "version": "1.8.0", + "version": "1.9.0", "description": "Persistent context network for coding projects. 
Tracks tasks, dependencies, and decisions across sessions.", "author": { "name": "Mymir", From 412dfbeb4f6ce8b0b0009a63b7000875648ae3f7 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Sat, 13 Jun 2026 00:08:20 +0200 Subject: [PATCH 42/45] fix: align composer plugin with hotl gate and slim rule extracts --- .../composer/references/reviewer-rules.md | 16 ++- .../skills/mymir/references/artifacts.md | 25 +--- plugins/antigravity/skills/review/SKILL.md | 2 +- .../agents/composer-implementer.md | 2 +- .../claude-code/agents/composer-planner.md | 4 +- .../claude-code/agents/composer-researcher.md | 4 +- plugins/claude-code/agents/review.md | 2 +- plugins/claude-code/skills/composer/SKILL.md | 16 +-- .../composer/references/implementer-rules.md | 16 +-- .../composer/references/planner-rules.md | 19 +-- .../composer/references/researcher-rules.md | 115 +----------------- .../composer/references/reviewer-rules.md | 16 ++- .../skills/composer/references/sources.json | 2 +- .../skills/mymir/references/artifacts.md | 25 +--- .../composer/references/reviewer-rules.md | 16 ++- .../skills/mymir/references/artifacts.md | 25 +--- plugins/codex/skills/review/SKILL.md | 2 +- .../composer/references/reviewer-rules.md | 16 ++- .../skills/mymir/references/artifacts.md | 25 +--- plugins/cursor/skills/review/SKILL.md | 2 +- .../plugins/composer-scenarios.md | 10 +- 21 files changed, 94 insertions(+), 266 deletions(-) rename plugins/claude-code/skills/composer/tests/scenarios.md => tests/plugins/composer-scenarios.md (95%) diff --git a/plugins/antigravity/skills/composer/references/reviewer-rules.md b/plugins/antigravity/skills/composer/references/reviewer-rules.md index c32079df..b8b64b8d 100644 --- a/plugins/antigravity/skills/composer/references/reviewer-rules.md +++ b/plugins/antigravity/skills/composer/references/reviewer-rules.md @@ -2,7 +2,7 @@ Slim extract of the canonical mymir references for the review agent. 
Mirrors: `skills/mymir/references/conventions.md` §1, -`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and `skills/mymir/references/artifacts.md` §1 (`executionRecord`, `decisions`), §6. Headings carry their canonical file and section number so citations like `lifecycle §2.2` resolve unambiguously. When editing a @@ -42,6 +42,16 @@ If `files` is non-empty AND the work was a real code change (not research, not d A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Mymir refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + --- ## lifecycle §3 — Propagate after every change (Iron Law) @@ -69,7 +79,9 @@ The reviewer does not execute propagation. Your downstream-impact list names the ## artifacts §1 — Task artifact quality -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). 
diff --git a/plugins/antigravity/skills/mymir/references/artifacts.md b/plugins/antigravity/skills/mymir/references/artifacts.md index 657781f4..b96aba93 100644 --- a/plugins/antigravity/skills/mymir/references/artifacts.md +++ b/plugins/antigravity/skills/mymir/references/artifacts.md @@ -133,7 +133,7 @@ BAD: Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -402,29 +402,6 @@ The text you write into Mymir is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. 
The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/antigravity/skills/review/SKILL.md b/plugins/antigravity/skills/review/SKILL.md index e213d7ca..9b3fb888 100644 --- a/plugins/antigravity/skills/review/SKILL.md +++ b/plugins/antigravity/skills/review/SKILL.md @@ -367,7 +367,7 @@ The dispatch carries the explicit PR URL; do not re-resolve it from `task.links` - ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. -- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. +- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. - ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. 
- ALWAYS verify dispatched-vs-direct mode for return shape. diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index ac926287..950be628 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -216,7 +216,7 @@ When the dispatch says fix mode, the reviewer requested changes on your PR and t 4. Inspect the branch for foreign commits: compare the PR's commit authors (`gh pr view --json commits --jq '.commits[].authors[].login'`) against your own identity (`git config user.name` and the login you push as). Foreign commits found: note them verbatim in your return message and re-evaluate ALL acceptance criteria in step 7, not only the ACs the findings touched — someone else's edits may have moved ground under criteria you previously satisfied. 5. Address **exactly the blocking findings in the dispatch**. No replanning, no scope expansion, no drive-by refactors. An accepted human direction change (a rework finding that redirects an approach) lands as a `decisions` entry (CHOICE + WHY) before the code change. A finding you believe is wrong: do not silently skip it; note your reasoning in the return message and fix the rest. 6. Re-run the full verification suite (typecheck, lint, tests) until green, push to the same branch. -7. Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched, or all ACs when step 4 found foreign commits). The status re-read from step 6's success path applies here. +7. Re-mark `in_review` with an updated Completion Protocol payload (append a one-line `executionRecord` delta describing the fix; re-evaluate only the ACs the findings touched, or all ACs when step 4 found foreign commits). The pre-write status re-read from the main procedure's *Mark in_review* step applies here. 8. 
Return: ` fix rotation complete. PR . .` plus the STATUS line per the success/failure paths above. In rework mode you MAY post one `gh pr comment --body ''` — at most one per rotation. You NEVER resolve review threads; resolution is the human's prerogative. ## Environmental failures diff --git a/plugins/claude-code/agents/composer-planner.md b/plugins/claude-code/agents/composer-planner.md index 5975acdc..9d6dd8dc 100644 --- a/plugins/claude-code/agents/composer-planner.md +++ b/plugins/claude-code/agents/composer-planner.md @@ -41,7 +41,7 @@ Your phase rules load with this agent as a slim extract of the canonical mymir r - If the brief surfaces material drift (new files revealed, version mismatch on a library the plan depends on, ACs the brief flagged as ambiguous): rewrite the plan to incorporate the brief's findings. Status stays `planned`. The rewrite replaces the prior plan in the `implementationPlan` field (it is a single text column; updates overwrite), so be conservative. Only rewrite when the brief shows real drift, not because you would write it differently. The audit log records that the field changed but does not preserve the prior text. - Refinements to other fields (description, acceptance criteria, tags, category) follow the same append-only rules as a `draft` entry. -You follow the canonical `Plan a draft task` workflow from `plugins/claude-code/skills/mymir/SKILL.md`. This file is the dispatched-mode adaptation of that flow. +You follow the canonical `Plan a draft task` workflow from the mymir skill (`skills/mymir/SKILL.md`). This file is the dispatched-mode adaptation of that flow. ## Iron Law of grounding @@ -67,7 +67,7 @@ You own one transition: `draft → planned`. That is the only legal status value - `status='planned'`: legal **only when entry status was `draft`**. Required in the same call as `implementationPlan`. - `status='in_progress'`: forbidden. Belongs to the implementer's claim. -- `status='done'`: forbidden. 
Belongs to the implementer's completion. +- `status='done'`: forbidden. Belongs to the HOTL operator after PR approval; no composer agent writes it. - `status='cancelled'`: forbidden. Only the user can request cancellation; the planner never decides to abandon a task. - `status='draft'`: forbidden. There is no legal "demote to draft" path in the composer pipeline. diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index d4310bda..0f843473 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -48,7 +48,7 @@ conventions §1 applies to every refinement you apply and every line of the brie - `mymir_task` (`update` only, restricted to these fields: `description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`). These are the **refinement fields**; they sharpen the *what* of the task. You apply refinements directly so the planner reads a clean task. - `WebSearch`, `WebFetch`: outward research when context7 misses. - `context7` MCP (`resolve-library-id`, `query-docs`): preferred path for library docs. -- `Bash` restricted to read-only commands: `gh pr list`, `gh pr view`, `gh issue view`, `cat package.json`-equivalents via `Read`. No mutating `gh` (`pr create`, `pr edit`, `pr merge`) and no arbitrary shell. +- `Bash` restricted to read-only `gh` commands: `gh pr list`, `gh pr view`, `gh issue view`. No mutating `gh` (`pr create`, `pr edit`, `pr merge`) and no arbitrary shell. Read manifests and configs with `Read`, not `cat`. ## Forbidden tools @@ -63,7 +63,7 @@ You own zero transitions. Leave `status` off every `mymir_task` call. Refining ` - `status='draft'`: forbidden. The task already has a status; refining never resets it. - `status='planned'`: forbidden. Belongs to the planner's `draft → planned` transition. - `status='in_progress'`: forbidden. Belongs to the implementer's claim. 
-- `status='done'`: forbidden. Belongs to the implementer's completion. +- `status='done'`: forbidden. Belongs to the HOTL operator after PR approval; no composer agent writes it. - `status='cancelled'`: forbidden. Only the user can request cancellation, routed through the mymir skill directly. ### Substantive rewrites: propose, do not apply diff --git a/plugins/claude-code/agents/review.md b/plugins/claude-code/agents/review.md index 8a5e984e..9377a83f 100644 --- a/plugins/claude-code/agents/review.md +++ b/plugins/claude-code/agents/review.md @@ -368,7 +368,7 @@ The dispatch carries the explicit PR URL; do not re-resolve it from `task.links` - ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. -- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. +- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. 
- ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. - ALWAYS verify dispatched-vs-direct mode for return shape. diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index fb68ddff..9c1797e1 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -14,7 +14,7 @@ description: > Composer is a Mymir task orchestrator. Per iteration it picks the next ready task off the project's critical path, dispatches four phase subagents in sequence (research, plan, implement, review), runs a bounded review→fix loop, propagates the result through the graph, and continues until a structural stop condition holds. Each subagent runs in a fresh context with a focused tool set; the orchestrator stays clean and writes nothing to tasks except propagation edges. -Composer is glue. The heavy lifting (task selection, refinement, the Completion Protocol, propagation) lives in the `mymir` skill (`plugins/claude-code/skills/mymir/SKILL.md`); composer reuses those flows rather than duplicating them. +Composer is glue. The heavy lifting (task selection, refinement, the Completion Protocol, propagation) lives in the `mymir` skill (`skills/mymir/SKILL.md`); composer reuses those flows rather than duplicating them. ## Invocation @@ -58,7 +58,7 @@ Every subagent return ends with `STATUS: `. 
Branch | `NEEDS_DECISION` | A user decision is required | Gate via `AskUserQuestion`; act on the answer | | `BLOCKED` | Phase cannot complete | *Failure handling* | -Expected `NEEDS_DECISION` triggers (typically from the researcher; any phase may raise one — gate the same way and re-dispatch the **raising agent** with the answers): +Expected `NEEDS_DECISION` triggers (typically from the researcher; the planner may raise one too — gate the same way and re-dispatch the **raising agent** with the answers; the implementer and reviewer contracts do not return this status): - **Oversize** (`oversize-task` flag): offer to dispatch `mymir:decompose-task` or skip the task. Composer never splits a task itself. - **Proposed rewrites** (`## Proposed rewrites` non-empty): show original vs proposed per field with the researcher's rationale; offer accept / deny. On accept, apply via `mymir_task action='update'` and re-dispatch the researcher on the rewritten task (the old brief is invalid). On deny, end the iteration: backlog mode picks the next task; single-task mode stops. @@ -159,7 +159,7 @@ digraph composer_iteration { 4. **Implement.** First check the pick type: when the pick was plannable-only, do not enter this step — the iteration already ended at `planned` (step 3). Otherwise dispatch `mymir:composer-implementer` with: `Target task: . Plan is saved to Mymir; fetch via mymir_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. The implementer runs worktree-isolated (frontmatter `isolation: worktree`; also pass the Task tool's `isolation: "worktree"` parameter at dispatch, which is verified to work with plugin agents): it works in its own tree, the orchestrator's tree never moves, and the researcher's baseline stays stable. -5. 
**CI gate.** After the implementer returns DONE with a PR URL, watch the checks with a bounded timeout and branch on the **exit code**, never on truncated output: `timeout 600 gh pr checks --watch; rc=$?`. `rc=0` → green. `rc=124` (timeout killed the watch mid-pending) or `rc=8` (gh's checks-pending code) → still pending. Any other non-zero `rc` → red; read the failing check names from the output. Skip the gate entirely when the repo has no checks configured (`gh pr checks` reports no checks — that is a skip, not a red). Branch on the result: +5. **CI gate.** After the implementer returns DONE with a PR URL, watch the checks with a bounded timeout: `timeout 600 gh pr checks --watch; rc=$?`. Branch on the **exit code**, with one output-based exception: `rc=0` → green. `rc=124` (timeout killed the watch mid-pending) or `rc=8` (gh's checks-pending code) → still pending. Any other non-zero `rc` → red — unless the output says no checks are reported (gh has no distinct exit code for a repo with no checks configured; it shares the red codes). No checks reported → skip the gate entirely; that is a skip, not a red. For a real red, read the failing check names from the output. Branch on the result: - **Green**: dispatch the reviewer normally. - **Red**: dispatch the reviewer with the failing check names appended to the dispatch (`CI: failing — <failing check names>`); the reviewer may not approve red CI. - **Still pending at the 10-minute timeout**: dispatch the reviewer with `CI: unresolved after 10m`; `approve` is off the table, and an otherwise-clean review returns `request-changes` citing unresolved CI as the sole blocking finding. @@ -258,7 +258,7 @@ Future (documented, not built): a GitHub webhook feeding `task_links.metadata` a Only under `--pipelined`, only in backlog mode, lookahead 1. The win is latency (~15–25%), not tokens; when in doubt, run without the flag.
-- **Trigger:** dispatch researcher(B) in the background only after implementer(A) returns DONE — overlap covers A's CI gate, review, and fix rotations only. Never manage background work while A is still implementing. +- **Trigger:** dispatch researcher(B) in the background only after implementer(A) returns DONE — overlap covers A's CI gate, review, and fix rotations only. Never dispatch the prefetch while A's initial implement phase is still running. - **Pick B excluding A.** B must be ready independently of A by construction — `in_review` unblocks nothing, so the ready set already excludes A's dependents. - **Isolation:** researcher(B) is dispatched with worktree isolation and `run_in_background`; the orchestrator's tree and A's review baseline never move. - **Brief custody:** when researcher(B) returns, append a `BRIEF` event to the run log (`task= baselinedAt=`) with the brief verbatim as `> ` continuation lines. The transcript copy is working memory; the log copy survives compaction. The prefetch is not a `PICK`: B's `PICK` line is written when B's own iteration starts, so recovery's last-`PICK`-without-`TASK_END` rule still finds A. @@ -350,7 +350,7 @@ Not a decomposer (oversize routes out). Not a hand-refiner (that is the mymir sk ## See also -- `plugins/claude-code/skills/mymir/SKILL.md`: canonical flows composer reuses — selection (§ *What should I work on?*), refinement (§ *Refine a task*), planning (§ *Plan a draft task*), implementation (§ *Implement a task*), propagation. -- `plugins/claude-code/agents/composer-researcher.md`, `composer-planner.md`, `composer-implementer.md`, `review.md`: the four phase contracts, including each phase's STATUS rules. -- `plugins/claude-code/skills/composer/references/`: the slim per-phase rule extracts the agents load. -- `plugins/claude-code/agents/decompose-task.md`: the oversize-delegation target. 
+- `skills/mymir/SKILL.md`: canonical flows composer reuses — selection (§ *What should I work on?*), refinement (§ *Refine a task*), planning (§ *Plan a draft task*), implementation (§ *Implement a task*), propagation. +- `agents/composer-researcher.md`, `agents/composer-planner.md`, `agents/composer-implementer.md`, `agents/review.md`: the four phase contracts, including each phase's STATUS rules. +- `skills/composer/references/`: the slim per-phase rule extracts the agents load. +- `agents/decompose-task.md`: the oversize-delegation target. diff --git a/plugins/claude-code/skills/composer/references/implementer-rules.md b/plugins/claude-code/skills/composer/references/implementer-rules.md index b75225f0..14eb8db6 100644 --- a/plugins/claude-code/skills/composer/references/implementer-rules.md +++ b/plugins/claude-code/skills/composer/references/implementer-rules.md @@ -170,7 +170,9 @@ When in doubt, ask the user before opening. ## artifacts §1 — Task artifact quality -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +You write this field at the `in_review` transition; it is the core of your Completion Protocol payload. - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -240,18 +242,6 @@ The text you write into Mymir is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. 
The rule is "no fluff", not "no length". diff --git a/plugins/claude-code/skills/composer/references/planner-rules.md b/plugins/claude-code/skills/composer/references/planner-rules.md index 0d5f3e9d..7c5fcf7f 100644 --- a/plugins/claude-code/skills/composer/references/planner-rules.md +++ b/plugins/claude-code/skills/composer/references/planner-rules.md @@ -19,14 +19,12 @@ Never write what you cannot cite or do not know. Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. -- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. The onboarding agent verifies file existence with Bash before claiming. +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. - `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. - `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. -**Spec-review and open-questions tasks: cite the on-graph artifact.** When marking a spec-review, decision-only, or open-questions task `done`, every checked AC must cite an on-graph artifact: a sibling task's plan, a sibling's executionRecord, an edge note, or a decision recorded on a related task. Do not synthesize answers from training data. Reference the related task by ref (e.g. `MYMR-83`) inside the AC text or the executionRecord. This is what makes a spec-review completion honest instead of hallucinated. - `decisions` are different (see §1 of the artifact rules below). They come from the conversation, not from artifact-mining. --- @@ -139,10 +137,7 @@ Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is compl One-liner per decision. 
Format: **CHOICE + WHY**. -Where decisions come from: - -- **Refinement, planning, or implementation conversation.** When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. -- **Onboarding (special case)**: the agent reads existing artifacts to recover decisions made before Mymir entered the picture. Sources: manifest files (`package.json`, `Cargo.toml`, `go.mod`, `pyproject.toml`, `Package.swift`), README and design docs, commit messages with words like *chose*, *switched*, *replaced*, *migrated*. If a decision is not grounded in any of those, omit it. Better a shorter list than fabrication. +Decisions come from the refinement, planning, or implementation conversation. When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. ``` GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." @@ -193,16 +188,6 @@ The text you write into Mymir is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". 
A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/claude-code/skills/composer/references/researcher-rules.md b/plugins/claude-code/skills/composer/references/researcher-rules.md index 73e44b7d..312cacd9 100644 --- a/plugins/claude-code/skills/composer/references/researcher-rules.md +++ b/plugins/claude-code/skills/composer/references/researcher-rules.md @@ -2,7 +2,7 @@ Slim extract of the canonical mymir references for the composer researcher. Mirrors: `skills/mymir/references/conventions.md` §1, §4 and -`skills/mymir/references/artifacts.md` §1 (Title, `description`, +`skills/mymir/references/artifacts.md` §1 (`description`, `acceptanceCriteria`, `decisions`), §2, §5, §6. Headings carry their canonical file and section number so citations like `conventions §1` resolve unambiguously. When editing a mirrored section, edit BOTH files. @@ -17,14 +17,12 @@ Never write what you cannot cite or do not know. Applies wherever an agent generates `executionRecord`, `decisions`, `description`, or `files`. -- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. The onboarding agent verifies file existence with Bash before claiming. +- `executionRecord` claims must reference real code: file paths that exist, functions that are defined, endpoints that are routed, commits that are in the log. - `description` must reflect actual scope. Do not stretch a one-line ask into an invented full feature. - `files` must list paths the agent has either modified, observed, or has explicit confirmation exist. When uncertain, write less. A short, true record is more valuable than a rich, fabricated one. 
-**Spec-review and open-questions tasks: cite the on-graph artifact.** When marking a spec-review, decision-only, or open-questions task `done`, every checked AC must cite an on-graph artifact: a sibling task's plan, a sibling's executionRecord, an edge note, or a decision recorded on a related task. Do not synthesize answers from training data. Reference the related task by ref (e.g. `MYMR-83`) inside the AC text or the executionRecord. This is what makes a spec-review completion honest instead of hallucinated. - `decisions` are different (see §1 of the artifact rules below). They come from the conversation, not from artifact-mining. --- @@ -37,21 +35,6 @@ Tool responses include a `taskRef` like `MYMR-83`: uppercase project prefix, das ## artifacts §1 — Task artifact quality -### Title - -Verb plus noun, imperative. - -``` -GOOD: "Implement JWT auth" -GOOD: "Fix Queue::front returning a copy" -GOOD: "Profile renderer hot path" -GOOD: "Train baseline ResNet on internal dataset" - -BAD: "Auth" -BAD: "Queue stuff" -BAD: "Performance" -``` - ### `description` The first thing a coding agent or engineer reads when picking up a task. It must be enough on its own to start the work. Concise and clear. @@ -69,8 +52,6 @@ Cover, depending on task type: Length: 2 to 4 sentences for most tasks. Up to 6 to 8 sentences for genuinely complex tasks. Single-sentence descriptions are rejected. -**For onboarding** (writing descriptions for tasks that already shipped): write the description as if the task were being created BEFORE the work, knowing what you now know about the codebase. The reader must be able to re-derive the work from the description. Do not write "added the auth middleware". Write "Build the JWT auth middleware in `lib/auth/middleware.ts`. Validate Bearer tokens against the user table, set `req.user`, reject on expiry. Required by every protected route." - ``` GOOD (feature, web SaaS): "Build the habit completion endpoint at POST /api/habits/:id/complete. 
Inserts @@ -90,32 +71,6 @@ ResNet-50 baseline. Question: does compile-time speedup outweigh JIT overhead on our 8-GPU pod? A good answer is a benchmark script plus a one-paragraph recommendation comparing wall-clock per epoch and peak memory." -GOOD (refactor, embedded firmware): -"Move the SPI driver from polling to DMA. Same public surface (spi_send, -spi_recv), same wire protocol. Internally use STM32 HAL DMA1 channel 3 for -TX. Reduces CPU usage during sensor reads from ~15% to <1% per existing -profile traces." - -GOOD (feature, game engine): -"Add deterministic frame stepping to the simulation tick. New API -Engine::stepFrame(uint32_t seed) so replay tooling and netcode tests can -re-run identical state from a recorded seed. Affects PhysicsWorld, Scheduler, -and the InputBuffer drain order." - -GOOD (data / dbt model build): -"Build the daily_active_users dbt model in models/marts/engagement/. Reads -from stg_events.session_started, deduplicates on (user_id, date_trunc('day', -event_ts)), excludes internal traffic via is_internal flag from dim_users. -Materializes incremental on event_date with a 7-day lookback window. Used by -the Looker `Engagement Overview` dashboard and the weekly stakeholder report." - -GOOD (BA / metric definition): -"Define the gross_margin metric in the dbt metrics layer. Formula: (revenue -- cogs) / revenue, dimensioned by product_line, channel, and order_month. -Source: fct_orders joined to dim_products. Replaces the four near-duplicate -SQL versions currently maintained by Sales Ops, Finance, and Marketing. -Stakeholders: CFO weekly review, RevOps dashboard." - BAD: "Improve the database." BAD: "Make auth better." BAD: "Fix the bug in queue." 
@@ -138,24 +93,11 @@ GOOD (firmware): - "DMA TX completion fires interrupt; no busy-loop in the driver" - "spi_recv returns 0xFF when MISO is held high, verified on the bench" -GOOD (data / dbt): -- "dbt run --select daily_active_users completes in under 90s on prod warehouse" -- "Row count of daily_active_users on 2026-05-01 matches stg_events session count to within 0.1%" -- "dbt test passes: not_null on user_id and event_date, unique on (user_id, event_date)" -- "Looker `Engagement Overview` dashboard refreshes against the new model with no broken tiles" - -GOOD (BA / analysis deliverable): -- "Churn analysis SQL in analyses/2026q2_churn.sql returns the 14 churned cohorts with ARR per cohort" -- "Numbers reconcile with finance_actuals.gross_revenue to within $500 for every month in scope" -- "Stakeholder review notes from the 2026-05-08 RevOps sync are attached to the task" - BAD: - "Database works" - "All tables created" - "Tests pass" - "Performance is good" -- "Dashboard looks right" -- "Numbers match" ``` Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. @@ -164,25 +106,16 @@ Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is compl One-liner per decision. Format: **CHOICE + WHY**. -Where decisions come from: - -- **Refinement, planning, or implementation conversation.** When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. -- **Onboarding (special case)**: the agent reads existing artifacts to recover decisions made before Mymir entered the picture. Sources: manifest files (`package.json`, `Cargo.toml`, `go.mod`, `pyproject.toml`, `Package.swift`), README and design docs, commit messages with words like *chose*, *switched*, *replaced*, *migrated*. 
If a decision is not grounded in any of those, omit it. Better a shorter list than fabrication. +Decisions come from the refinement, planning, or implementation conversation. When the user and the agent (or two agents) settle on a choice, that's a decision. The agent should automatically record it without being asked. If the agent is uncertain whether a choice rises to "decision" level, ask the user briefly to confirm. ``` GOOD (web): "Chose Redis for refresh tokens. Need fast revocation lookups." -GOOD (web): "Switched from Prisma to Drizzle. See package.json migration commit." GOOD (sim): "Use std::vector for the Queue backing storage. Cheap front() lookup, fast tail insert; spec is silent on container choice." -GOOD (ML): "Chose ONNX runtime over PyTorch for inference. 30% lower p99 on the target Jetson Orin." -GOOD (embedded): "Pick Zephyr over FreeRTOS for the new flight controller. Built-in CAN driver, Apache-2.0 license." GOOD (agentic): "Use a per-thread tool registry. Two concurrent agent loops were stepping on each other's MCP client state." -GOOD (data): "Use dbt incremental over full-refresh on daily_active_users. Source events table is 4B rows; full-refresh exceeds the 30-minute warehouse SLA." -GOOD (BA): "Adopt dbt metrics layer over per-dashboard SQL. Four duplicates of gross_margin already exist across Looker, Tableau, and the weekly deck; one definition replaces them all." BAD: "Used Drizzle" BAD: "We picked Redis because it's good" BAD: "Decided to do it that way" -BAD: "dbt is better" ``` Never invent. If a decision is not grounded in conversation, code, or the artifacts above, leave it out. 
@@ -219,13 +152,8 @@ These are top-level columns on every task, set via `mymir_task` parameters of th **Tech tag examples by domain:** - Web: `react`, `next`, `drizzle`, `postgres`, `tailwind` -- Mobile: `swift`, `swiftui`, `kotlin`, `coreml`, `room` -- Game: `unity`, `unreal`, `cpp`, `glsl`, `wgsl` -- Simulation: `cpp`, `fortran`, `mpi`, `cuda` - Embedded: `c`, `rust`, `freertos`, `stm32-hal`, `zephyr` -- ML: `pytorch`, `jax`, `triton`, `clickhouse`, `dvc` -- Financial: `python`, `quantlib`, `numpy`, `arrow` -- Data / analytics / BA: `sql`, `dbt`, `bigquery`, `snowflake`, `postgres`, `looker`, `tableau`, `metabase`, `powerbi`, `airflow`, `dagster` +- Data / ML: `sql`, `dbt`, `pytorch`, `clickhouse`, `airflow` Pull tech tags from the project's actual stack. Do not invent. @@ -235,20 +163,10 @@ Pull tech tags from the project's actual stack. Do not invent. **1 to 4 hours per task.** A coding agent should complete one in a single session. -> **Starting count is not a cap.** The numbers below are seed values for decompose / onboarding, not enumeration of every task that will ever exist. Real projects accumulate tasks as work materializes; teams add tasks every day. When a parent agent or a test rig caps the task count below the table's range, honor the cap and document the deviation in your transcript or local working file. - -| Project size | Starting task count | -|---|---| -| Hackathon / 1-day spike | 5 to 10 | -| Simple (≤5 features, single user role) | 10 to 20 | -| Medium (5 to 15 features, several roles) | 20 to 40 | -| Complex (15+ features, multiple subsystems) | 40 to 80 | -| Enterprise / multi-team / long-running | 60 to 120 foundation tasks. The graph grows organically into the hundreds or thousands as teams add work. | - Too small (under 30 minutes): overhead exceeds work. Too large (over 1 day): hidden subtasks, unclear scope, hard to track. -When in doubt, split. Tasks become more useful, and more parallelizable, as they shrink toward the 1-hour mark. 
+When in doubt, split. Tasks become more useful, and more parallelizable, as they shrink toward the 1-hour mark. Splitting is the decompose agent's job; the researcher's part is raising `oversize-task` when the true scope exceeds what `13` represents. --- @@ -287,29 +205,6 @@ The text you write into Mymir is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. 
diff --git a/plugins/claude-code/skills/composer/references/reviewer-rules.md b/plugins/claude-code/skills/composer/references/reviewer-rules.md index c32079df..b8b64b8d 100644 --- a/plugins/claude-code/skills/composer/references/reviewer-rules.md +++ b/plugins/claude-code/skills/composer/references/reviewer-rules.md @@ -2,7 +2,7 @@ Slim extract of the canonical mymir references for the review agent. Mirrors: `skills/mymir/references/conventions.md` §1, -`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and `skills/mymir/references/artifacts.md` §1 (`executionRecord`, `decisions`), §6. Headings carry their canonical file and section number so citations like `lifecycle §2.2` resolve unambiguously. When editing a @@ -42,6 +42,16 @@ If `files` is non-empty AND the work was a real code change (not research, not d A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Mymir refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + --- ## lifecycle §3 — Propagate after every change (Iron Law) @@ -69,7 +79,9 @@ The reviewer does not execute propagation. 
Your downstream-impact list names the ## artifacts §1 — Task artifact quality -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). diff --git a/plugins/claude-code/skills/composer/references/sources.json b/plugins/claude-code/skills/composer/references/sources.json index c7636104..68119785 100644 --- a/plugins/claude-code/skills/composer/references/sources.json +++ b/plugins/claude-code/skills/composer/references/sources.json @@ -2,7 +2,7 @@ "_comment": "Canonical-source hash pins for the composer phase extracts in this directory. The extracts hand-mirror sections of these files; scripts/check-plugins.ts fails CI when a pinned file changes, until the extracts are reviewed and the pin refreshed via `bun run sync:plugins`.", "pins": { "plugins/claude-code/skills/mymir/references/conventions.md": "e5aedc7f4fc602e17c4e93b1bd73528988ce99977f3589de57f7b4c0b4431c11", - "plugins/claude-code/skills/mymir/references/artifacts.md": "0d56779937d0beaa30a502677d593d416a097b64e36520fb7979b887db39cdb9", + "plugins/claude-code/skills/mymir/references/artifacts.md": "b9869b386099c1daf4192592ddd5a67b2b69f05802120b119a7bd4adbb00b140", "plugins/claude-code/skills/mymir/references/lifecycle.md": "8db13246adf25c6a388ddd35f4868ff5ee7ba9b3ce4dd77cb85d1c0ae68d0646" } } diff --git a/plugins/claude-code/skills/mymir/references/artifacts.md b/plugins/claude-code/skills/mymir/references/artifacts.md index 657781f4..b96aba93 100644 --- a/plugins/claude-code/skills/mymir/references/artifacts.md +++ b/plugins/claude-code/skills/mymir/references/artifacts.md @@ -133,7 +133,7 @@ BAD: Single-AC tasks are rejected. 
Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -402,29 +402,6 @@ The text you write into Mymir is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. 
diff --git a/plugins/codex/skills/composer/references/reviewer-rules.md b/plugins/codex/skills/composer/references/reviewer-rules.md index c32079df..b8b64b8d 100644 --- a/plugins/codex/skills/composer/references/reviewer-rules.md +++ b/plugins/codex/skills/composer/references/reviewer-rules.md @@ -2,7 +2,7 @@ Slim extract of the canonical mymir references for the review agent. Mirrors: `skills/mymir/references/conventions.md` §1, -`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and `skills/mymir/references/artifacts.md` §1 (`executionRecord`, `decisions`), §6. Headings carry their canonical file and section number so citations like `lifecycle §2.2` resolve unambiguously. When editing a @@ -42,6 +42,16 @@ If `files` is non-empty AND the work was a real code change (not research, not d A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Mymir refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. +- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + --- ## lifecycle §3 — Propagate after every change (Iron Law) @@ -69,7 +79,9 @@ The reviewer does not execute propagation. 
Your downstream-impact list names the ## artifacts §1 — Task artifact quality -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). diff --git a/plugins/codex/skills/mymir/references/artifacts.md b/plugins/codex/skills/mymir/references/artifacts.md index 657781f4..b96aba93 100644 --- a/plugins/codex/skills/mymir/references/artifacts.md +++ b/plugins/codex/skills/mymir/references/artifacts.md @@ -133,7 +133,7 @@ BAD: Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -402,29 +402,6 @@ The text you write into Mymir is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." - -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." 
- -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/codex/skills/review/SKILL.md b/plugins/codex/skills/review/SKILL.md index e213d7ca..9b3fb888 100644 --- a/plugins/codex/skills/review/SKILL.md +++ b/plugins/codex/skills/review/SKILL.md @@ -367,7 +367,7 @@ The dispatch carries the explicit PR URL; do not re-resolve it from `task.links` - ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. -- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. 
+- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. - ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. - ALWAYS verify dispatched-vs-direct mode for return shape. diff --git a/plugins/cursor/skills/composer/references/reviewer-rules.md b/plugins/cursor/skills/composer/references/reviewer-rules.md index c32079df..b8b64b8d 100644 --- a/plugins/cursor/skills/composer/references/reviewer-rules.md +++ b/plugins/cursor/skills/composer/references/reviewer-rules.md @@ -2,7 +2,7 @@ Slim extract of the canonical mymir references for the review agent. Mirrors: `skills/mymir/references/conventions.md` §1, -`skills/mymir/references/lifecycle.md` §2.2, §2.3, §3, and +`skills/mymir/references/lifecycle.md` §2.2, §2.3, §2.4, §3, and `skills/mymir/references/artifacts.md` §1 (`executionRecord`, `decisions`), §6. Headings carry their canonical file and section number so citations like `lifecycle §2.2` resolve unambiguously. When editing a @@ -42,6 +42,16 @@ If `files` is non-empty AND the work was a real code change (not research, not d A missing PR on a code-changing task, a missing bracket ref, or a fabricated template section is a finding. +## lifecycle §2.4 — Skip the PR for these task types + +A missing PR is legitimate (not a finding) for: + +- Research / investigation tasks (no code change). +- Decision-only tasks. +- Pure-Mymir refinement tasks (no repo changes). +- Tasks the user explicitly said "no PR" on. 
+- Data and BA work without a code repo (dashboard tweaks, workbooks, metric sign-offs, ad-hoc SQL attached to a ticket). The deliverable lives outside git; the artifact link or path belongs in `executionRecord` and `files`. When the data work IS in a git repo (a dbt project, a versioned SQL or notebook repo), the standard PR rules apply. + --- ## lifecycle §3 — Propagate after every change (Iron Law) @@ -69,7 +79,9 @@ The reviewer does not execute propagation. Your downstream-impact list names the ## artifacts §1 — Task artifact quality -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) + +The implementer writes this field at the `in_review` transition; you verify it against the diff. - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). diff --git a/plugins/cursor/skills/mymir/references/artifacts.md b/plugins/cursor/skills/mymir/references/artifacts.md index 657781f4..b96aba93 100644 --- a/plugins/cursor/skills/mymir/references/artifacts.md +++ b/plugins/cursor/skills/mymir/references/artifacts.md @@ -133,7 +133,7 @@ BAD: Single-AC tasks are rejected. Tasks with vague ACs ("works correctly", "is complete", "performs well") are rejected. -### `executionRecord` (only on `done` and `cancelled`) +### `executionRecord` (only on `in_review`, `done`, and `cancelled`) - **Length:** 3 to 5 sentences. - **Distinct from `description`:** description = scope + role; executionRecord = HOW it was built (or WHY it was abandoned). @@ -402,29 +402,6 @@ The text you write into Mymir is read by other engineers. It must read like an e - Cut adverbs. - One idea per sentence. -### Em-dash replacements - -``` -BAD (web): "Custom auth — months of work — is off the table." -GOOD: "Custom auth is off the table. Months of work, easy to leak data." 
- -BAD (web): "The API uses Bearer tokens — validated against the users table." -GOOD: "The API validates Bearer tokens against the users table." - -BAD (sim): "Rejected — see line 42 of the spec." -GOOD: "Rejected. See line 42 of the spec." - -BAD (agentic): "The agent loop dispatches tools — validated against the - registry — then streams the model output." -GOOD: "The agent loop validates each tool against the registry - before dispatching, then streams the model output." - -BAD (firmware):"BMP280 returns 0xFF — the i2c clock-stretch fix is not - backported." -GOOD: "BMP280 returns 0xFF. The i2c clock-stretch fix is not - backported." -``` - ### Length Concision over padding. No filler, no AI throat-clearing, no repetition. But do not sacrifice clarity for brevity. If a task genuinely needs 6 to 8 sentences in its description because the architecture has multiple components, the bug has a complex cause, or the research question is multi-part, write them. The rule is "no fluff", not "no length". A 6-sentence description that helps a reader is better than a 2-sentence one that loses them. diff --git a/plugins/cursor/skills/review/SKILL.md b/plugins/cursor/skills/review/SKILL.md index e213d7ca..9b3fb888 100644 --- a/plugins/cursor/skills/review/SKILL.md +++ b/plugins/cursor/skills/review/SKILL.md @@ -367,7 +367,7 @@ The dispatch carries the explicit PR URL; do not re-resolve it from `task.links` - ALWAYS read your operating-rules extract at session start, and re-read mid-session when uncertain. - ALWAYS confirm `status='in_review'` before reading the diff. Reviewing other statuses is wrong-shaped work. - ALWAYS fetch `mymir_context depth='working'` at step 1 (no executionRecord / plan body / files in context) and `mymir_context depth='review'` at step 4 (full bundle for reconciliation). The two-phase split is the tool-enforced isolation that backs the first-pass discipline; folding both into a single `depth='review'` fetch at step 1 defeats it. 
-- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files, auth / MCP / data / migrations, `security` cross-cutting tag). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. +- ALWAYS dispatch the mandatory sub-reviewers when the diff hits the thresholds in the `Task` allowed-tools entry (>10 files; auth / authz / access control; public API, RPC, tool, or IPC surfaces; persistence schema or migrations; wire formats or release artifacts; `security` / `safety` / `compliance` tags). Returning `approve` on a mandatory-threshold review without naming which sub-reviewers ran is not a real review. - ALWAYS cite real file paths and line numbers from the diff for every finding. Iron Law (conventions §1). - ALWAYS pick one of three verdicts (`approve`, `request-changes`, `block`). No hedging. - ALWAYS verify dispatched-vs-direct mode for return shape. diff --git a/plugins/claude-code/skills/composer/tests/scenarios.md b/tests/plugins/composer-scenarios.md similarity index 95% rename from plugins/claude-code/skills/composer/tests/scenarios.md rename to tests/plugins/composer-scenarios.md index 078de3a2..63ab482f 100644 --- a/plugins/claude-code/skills/composer/tests/scenarios.md +++ b/tests/plugins/composer-scenarios.md @@ -1,10 +1,12 @@ # Composer regression scenarios Pressure-test suite for the composer workflow. Run after ANY edit to -`skills/composer/SKILL.md`, the `composer-*` agents, or `agents/review.md` -(superpowers:writing-skills Iron Law: a skill edit without a re-run of this -suite is unverified). Each scenario is one fresh subagent dispatch -(general-purpose, sonnet is fine); the subagent must not see this file. +`plugins/claude-code/skills/composer/SKILL.md`, the `composer-*` agents, or +`plugins/claude-code/agents/review.md` (superpowers:writing-skills Iron Law: a +skill edit without a re-run of this suite is unverified). 
Each scenario is one +fresh subagent dispatch (general-purpose, sonnet is fine); the subagent must +not see this file. Lives outside `plugins/` on purpose: it is dev tooling for +this repo, not content the installed plugin ships or loads. ## Dispatch template From a31bdb427c37be43cab237375111af55b2e8100c Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 19 Jun 2026 11:46:27 +0200 Subject: [PATCH 43/45] feat: restructure composer onto per-task workflow with merge gate --- README.md | 8 +- biome.jsonc | 3 +- .../agents/composer-implementer.md | 14 + .../claude-code/agents/composer-planner.md | 18 +- .../claude-code/agents/composer-researcher.md | 29 +- plugins/claude-code/skills/composer/SKILL.md | 378 ++++++++-------- .../skills/composer/workflows/compose-task.js | 408 ++++++++++++++++++ tests/plugins/composer-scenarios.md | 103 +++-- 8 files changed, 696 insertions(+), 265 deletions(-) create mode 100644 plugins/claude-code/skills/composer/workflows/compose-task.js diff --git a/README.md b/README.md index 676f77ee..bb644d1d 100644 --- a/README.md +++ b/README.md @@ -119,8 +119,8 @@ In Codex, Cursor, and Antigravity each workflow is a skill invoked by slash comm | Component | What it does | | --- | --- | -| **`/piyaz:composer` skill** | End-to-end task orchestrator. Picks the highest-value ready task (or one named ref), drives it through research → plan → implement → propagate via three dispatched subagents per task in clean per-phase contexts, loops until queue empty or user stops. Requires `/goal` harness for backlog mode (composer emits it on first turn; user pastes). | -| **Composer subagents** | `piyaz:composer-researcher` gathers grounded context and refines the task; `piyaz:composer-planner` writes the unabridged implementation plan; `piyaz:composer-implementer` ships the code, opens a PR, and marks the task done. | +| **`/piyaz:composer` skill** | End-to-end task orchestrator. 
Picks the highest-value ready task (or one named ref), drives it through research → plan → implement → review → propagate via a per-task workflow that dispatches phase subagents in clean per-phase contexts, merges the PR and continues when the user authorizes it, and loops until queue empty or user stops. Requires `/goal` harness for backlog mode (composer emits it on first turn; user pastes). | +| **Composer subagents** | `piyaz:composer-researcher` gathers grounded context and refines the task; `piyaz:composer-planner` writes the unabridged implementation plan; `piyaz:composer-implementer` ships the code, opens a PR, and marks the task `in_review`; `piyaz:review` returns the verdict that drives the bounded fix loop. | | **`piyaz:decompose-task` agent** | Splits an existing oversize task in an active project into 2 to N children, rewires every dependency edge touching the parent, cancels the parent with rationale citing the children. Composer's oversize handler routes here. | | **`piyaz:decompose-feature` agent** | Adds a new feature or capability cluster to an active project. Reuses existing categories and tag vocabulary; creates 5 to 20 tasks plus internal and integration edges. | @@ -169,7 +169,7 @@ Piyaz ships as a Next.js web app plus vendor-native plugins for Claude Code, Cod ❯ Priority is urgent, draft ACs are enough, and monorepo detection should ask the user. ``` -**Drive end-to-end (Claude Code).** Once a project is active and tasks are ready, composer can take over. Pick the next task off the critical path, research it in context, plan it, implement it, open the PR, propagate the result, and loop: +**Drive end-to-end (Claude Code).** Once a project is active and tasks are ready, composer can take over. 
Pick the next task off the critical path, research it in context, plan it, implement it, open the PR, review and fix until it is ready, propagate the result (and merge when you authorize it), and loop: ```text ❯ /piyaz:composer @@ -181,7 +181,7 @@ Or take one specific task all the way to a PR: ❯ /piyaz:composer PYZ-101 ``` -Composer dispatches three subagents per task in clean per-phase contexts (researcher → planner → implementer). The orchestrator stays out of the work itself and only picks tasks, hands off, and propagates. +Composer runs a per-task workflow that dispatches phase subagents in clean per-phase contexts (researcher → planner → implementer → review), with a bounded fix loop until the PR is ready. The orchestrator stays out of the work itself: it picks tasks, resolves gates, merges when authorized, and propagates. **Tune in the UI.** Inspect edges, read execution records, and edit descriptions, ACs, tags, or dependencies directly. The agent loop and the UI write to the same store, so edits land by the next tool call. diff --git a/biome.jsonc b/biome.jsonc index a025a80b..d1744e56 100644 --- a/biome.jsonc +++ b/biome.jsonc @@ -23,7 +23,8 @@ "!cloudflare-env.d.ts", "!bun.lock", "!migrations/**", - "!drizzle/**" + "!drizzle/**", + "!plugins/**/workflows/**" ] }, "formatter": { diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index 758b5c08..c70bdf7d 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -223,6 +223,20 @@ When the dispatch says fix mode, the reviewer requested changes on your PR and t When a `gh` call fails for environmental reasons — auth expiry (`gh auth status` failing, 401s), rate limiting, network errors — the work is not at fault. One immediate retry is fine; if it persists, stop and return `STATUS: BLOCKED — environmental: `. 
The orchestrator surfaces environmental failures to the user without consuming the failure budget; mislabeling a real verification failure as environmental hides broken work, so use this only for errors the environment alone can fix. +## Composer structured return + +When the composer workflow dispatches you, a structured-output schema is attached and your machine-readable return must populate these fields. The Completion Protocol payload is already written to Piyaz; these fields are the control signal the workflow branches on. + +- `status`: `DONE` (handed off for review), `DONE_WITH_CONCERNS` (handed off, but you carry a doubt named in `concerns`), or `BLOCKED` (verification could not reach green, plan broken, or an unexpected state). +- `prUrl`: the PR URL you opened, or `null` when the work legitimately changed no code (lifecycle §2.4) and you opened no PR. +- `branch`: the feature branch name, or `null`. +- `acSatisfied`: how many acceptance criteria you evaluated to satisfied. +- `acTotal`: the total acceptance-criteria count. +- `concerns`: one entry per concern for the orchestrator's attention; empty on a clean `DONE`. +- `reason`: the one-line STATUS reason. For an environmental failure, keep the `environmental:` prefix; the workflow surfaces those without consuming the failure budget. + +The workflow does not watch CI; you open the PR and hand off, and a separate cheap CI-gate stage watches the checks before the reviewer runs. Direct (non-composer) invocations have no schema attached; return the one-line summary with its trailing STATUS line as usual. + ## What this phase does not do - It does not replan. If the plan is wrong, fail back to the orchestrator; the orchestrator decides whether to re-run the planner. 
diff --git a/plugins/claude-code/agents/composer-planner.md b/plugins/claude-code/agents/composer-planner.md index 5aeda203..2fdffe53 100644 --- a/plugins/claude-code/agents/composer-planner.md +++ b/plugins/claude-code/agents/composer-planner.md @@ -77,7 +77,9 @@ When entry status was already `planned`, do **not** pass the `status` field at a 1. **Fetch planning context.** `piyaz_context depth='planning' taskId=''`. This gives the project description, prerequisite tasks' specs, downstream specs that depend on this task, and the current acceptance criteria. Read it in full; do not skim. -2. **Read the research brief.** Treat its citations as ground truth where they are verifiable from a quick codebase read; spot-check 2-3 file path / line range claims with `Read` to catch hallucinations. If a claim does not check out, drop it from the plan and note the discrepancy in the plan's *Decisions* section. +2. **Read the research brief and guard the foundation.** You are not only the brief's consumer; you are the last check on it before code gets written. Treat its citations as ground truth where they are verifiable from a quick codebase read; spot-check 2-3 file path / line range claims with `Read` to catch hallucinations. A claim that does not check out gets dropped from the plan with the discrepancy noted in the plan's *Decisions* section. + + When the failure is not one stray claim but the **foundation** — the refined description describes a task the codebase cannot support, the acceptance criteria are unverifiable or contradict each other, or the files the brief names do not exist and no plausible target does — do not plan on top of it. A plan built on a wrong task produces wrong code. Stop and return `STATUS: BLOCKED — foundation-unsound: `; the orchestrator re-runs research once before retrying you. Reserve this for a genuinely broken foundation, not for a brief you would have written differently. 3. 
**Refinements: typically already applied; only fill gaps.** The Phase 1 researcher applies refinements (description, acceptance criteria, tags, category, priority, estimate, decisions) directly to the target before handing off, so the task you read via `piyaz_context depth='planning'` should already reflect those changes. The brief's *Applied refinements* section names what landed. @@ -169,7 +171,19 @@ When entry status was already `planned`, do **not** pass the `status` field at a - `DONE`: plan saved and verified, or silent re-validation kept an existing valid plan. - `DONE_WITH_CONCERNS`: plan saved, but you noted risks the implementer should see (name them in the confirmation sentence). - `NEEDS_DECISION`: the brief left an open question the plan cannot resolve without the user (rare; the researcher should have gated it). - - `BLOCKED`: the plan write failed verification after your own retry, or the task is in a state you must not plan from. + - `BLOCKED`: the plan write failed verification after your own retry, the task is in a state you must not plan from, or the research foundation is unsound (`foundation-unsound:` prefix; step 2). The orchestrator re-runs research once on a `foundation-unsound` block. + +## Composer structured return + +When the composer workflow dispatches you, a structured-output schema is attached and your machine-readable return must populate these fields. The plan itself is already saved to Piyaz; these fields are the control signal, not the plan. + +- `status`: the STATUS value above. +- `sections`: the number of `##` sections in the plan you wrote (or re-validated). +- `buildSteps`: the number of numbered steps in the plan's *Build sequence*. +- `openQuestions`: the *Open questions* list, the items the implementer must escalate before guessing. +- `reason`: the one-line STATUS reason; for a `foundation-unsound` block, the `foundation-unsound:` prefix must be present here. 
+ +Direct (non-composer) invocations have no schema attached; return the one-sentence confirmation with its trailing STATUS line as usual. ## What this phase does not do diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index 79762ae5..7e8ffcea 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -126,7 +126,16 @@ Run these in the order given; do not skip. Steps 2–5 can fan out in parallel w Every refinement appends; never pass `overwriteArrays=true`. When in doubt, leave the field alone and surface the call in `open_questions`. Speculation in a `description` rewrite is worse than a thin description. -8. **Surface open questions.** Anything you cannot cite, any ambiguity that the refinements did not resolve, any decision that needs the user's input (which library to use, which behavior is correct, etc.) goes in `open_questions`. The orchestrator surfaces these before advancing to planning. +8. **Self-verify before returning.** Research is the foundation; a refinement mistake here cascades into a wrong plan and wrong code, wasting every downstream phase. Before you return, re-read the refined task (`piyaz_context depth='planning' taskId=''`) and check each item: + + - Every acceptance criterion is **binary**: a reviewer answers YES or NO without judgement (artifacts §1). An ambiguous criterion that survived to your return is a defect. Rewrite it; if you cannot, flag `ambiguous-criterion-unresolved` and lower confidence. + - Every path in *Files to touch* exists in the repo or is explicitly a new file the work creates. Drop or correct any path you cannot confirm. + - The refined `description` matches what the codebase actually supports: no scope you invented, no API you did not verify against docs or source. + - Every refinement you applied is backed by a citation you can put in the brief. 
A refinement without a citation is ungrounded; revert it. + + Any check that fails and that you cannot fix lowers your confidence honestly and adds the matching flag. A calibrated confidence below 0.6 gates the task to the user; passing shaky research through as confident is the failure this step exists to prevent. + +9. **Surface open questions.** Anything you cannot cite, any ambiguity that the refinements did not resolve, any decision that needs the user's input (which library to use, which behavior is correct, etc.) goes in `open_questions`. The orchestrator surfaces these before advancing to planning. ## Output format @@ -195,4 +204,20 @@ The STATUS line is the last line of your return and the only thing the orchestra - `DONE_WITH_CONCERNS`: brief is complete and nothing gates, but you raised non-gating flags (`version-drift-major`, `security-boundary-uncovered`, `missing-citation`, `dep-mismatch`, `ambiguous-criterion-unresolved`). - `DONE`: brief complete, no flags, confidence ≥ 0.6, no proposed rewrites. -The orchestrator passes this brief verbatim to the Phase 2 planner via the Task tool. Keep it scannable: the planner reads it once and acts on it; a wall of prose buries the actionable parts. The refinements you applied are already in Piyaz; the planner reads the refined task from `piyaz_context depth='planning'`; the brief is the *findings* the planner needs to write the plan against. +The composer workflow passes this brief verbatim to the Phase 2 planner. Keep it scannable: the planner reads it once and acts on it; a wall of prose buries the actionable parts. The refinements you applied are already in Piyaz; the planner reads the refined task from `piyaz_context depth='planning'`; the brief is the *findings* the planner needs to write the plan against. + +## Composer structured return + +When the composer workflow dispatches you, a structured-output schema is attached and your machine-readable return must populate these fields. 
The prose brief above is still your output; it goes in `brief` verbatim. + +- `status`: the STATUS value from *Choosing STATUS*. +- `brief`: the full markdown brief, verbatim. +- `confidence`: your calibrated confidence in `[0,1]`. +- `estimate`: the refined Fibonacci estimate (`1, 2, 3, 5, 8, 13`) or `null`. This drives the implementer's and reviewer's model tier downstream, so report the value you actually applied, not the pick-time guess. +- `workType`: the work-type tag you settled on (`feat`/`fix`/`refactor`/`docs`/`test`/`chore`/`perf`) or `null`. +- `flags`: the *Flags* list, controlled vocabulary. +- `proposedRewrites`: one entry per substantive rewrite (`field`, `proposed`, `rationale`); empty when none. +- `openQuestions`: the *Open questions* list. +- `reason`: the one-line STATUS reason. + +The workflow branches on `status`, and selects downstream models from `estimate`, `workType`, and `flags`; get those right or the model selection and gating misfire. Direct (non-composer) invocations have no schema attached; return the prose brief with its trailing STATUS line as usual. diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index e30b18a9..c5eecc80 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -2,17 +2,21 @@ name: composer description: > Use when the user types /piyaz:composer, /piyaz:composer , or - /piyaz:composer rework , or - asks to run the next Piyaz task end-to-end, ship the backlog, compose - through the ready queue, or loop through Piyaz tasks until done. Do NOT - invoke for one-off task lookups, status checks, hand-refinement of one - task, or interactive planning of a single task; those flows belong to the - piyaz skill and composer adds latency without adding quality. 
+ /piyaz:composer rework , or asks to run the next Piyaz + task end-to-end, ship the backlog, compose through the ready queue, or + loop through Piyaz tasks until done. Composer researches, refines, plans, + implements, reviews, and fixes each task in a loop until the PR is ready, + and merges and continues when the user authorizes it. Do NOT invoke for + one-off task lookups, status checks, hand-refinement of one task, or + interactive planning of a single task; those flows belong to the piyaz + skill and composer adds latency without adding quality. --- # Composer -Composer is a Piyaz task orchestrator. Per iteration it picks the next ready task off the project's critical path, dispatches four phase subagents in sequence (research, plan, implement, review), runs a bounded review→fix loop, propagates the result through the graph, and continues until a structural stop condition holds. Each subagent runs in a fresh context with a focused tool set; the orchestrator stays clean and writes nothing to tasks except propagation edges. +Composer is a Piyaz task orchestrator. Per iteration it picks the next ready task off the project's critical path, runs that task through a deterministic per-task **workflow** (research, plan, implement, CI gate, review, bounded fix loop), surfaces the verdict, merges when the user authorized it, propagates the result through the graph, and continues until a structural stop condition holds. + +The orchestrator (this skill, running in the main loop) owns only the **interactive seams**: pick the task, resolve gates, run the merge gate, propagate. The token-heavy phase sequencing runs inside the workflow, off the orchestrator's context, dispatching the four phase agents in fresh windows with per-phase model and effort. This is the design's main token discipline: orchestration is JavaScript, not main-loop reasoning over a transcript that grows with every phase. Composer is glue. 
The heavy lifting (task selection, refinement, the Completion Protocol, propagation) lives in the `piyaz` skill (`skills/piyaz/SKILL.md`); composer reuses those flows rather than duplicating them. @@ -21,9 +25,9 @@ Composer is glue. The heavy lifting (task selection, refinement, the Completion - **`/piyaz:composer`**: backlog mode. Pick the highest-value ready task each iteration; continue until a stop condition holds. - **`/piyaz:composer `**: single-task mode. Same pipeline applied to one task; exits after the iteration completes. - **`/piyaz:composer rework `**: rework mode. HOTL requested changes on GitHub instead of merging; composer rounds that feedback back through the fix loop. -- **`/piyaz:composer --pipelined`**: backlog mode with research-ahead. While task A is in review/fix, the researcher for the next task B runs in the background. Lookahead is hard-capped at 1. Backlog mode only — the flag is ignored in single-task and rework modes. +- **`/piyaz:composer --pipelined`**: backlog mode with research-ahead (latency-only, costs tokens). Off by default; see *Pipelined research-ahead*. -No argument means backlog mode; `--pipelined` alone is still backlog mode (with research-ahead); `rework` plus an argument means rework mode; anything else is single-task. +No argument means backlog mode; `rework` plus an argument means rework mode; anything else is single-task. ## Piyaz operating context @@ -34,323 +38,279 @@ The canonical piyaz rules load with this skill. Downstream citations (`conventio @skills/piyaz/references/lifecycle.md @skills/piyaz/references/resilience.md -## The four phase subagents - -Each is a registered plugin agent dispatched via the Task tool by `subagent_type`. Their contracts live in their own files; do not duplicate their logic here. +## The per-task workflow -| Phase | `subagent_type` | Writes to Piyaz | Returns | -| --- | --- | --- | --- | -| 1. 
Research | `piyaz:composer-researcher` | Refinement fields only (`description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`); never `status` | Research brief + `STATUS` line | -| 2. Plan | `piyaz:composer-planner` | `implementationPlan`, `decisions`; `status='planned'` on the `draft → planned` transition only | One-sentence confirmation + `STATUS` line | -| 3. Implement | `piyaz:composer-implementer` | `status='in_progress'` (claim), `status='in_review'` (+ full Completion Protocol payload); in fix mode rotates `in_review → in_progress → in_review` | PR URL + one-line summary + `STATUS` line | -| 4. Review | `piyaz:review` | Nothing (read-only over Piyaz) | Structured verdict + `STATUS` line | +Each iteration's task runs through `skills/composer/workflows/compose-task.js`, launched with the Workflow tool: -The task row is the single source of truth. The researcher refines it before planning; the planner saves the plan to it; the implementer reads everything (refined description, ACs, plan, upstream decisions) from `piyaz_context depth='agent'`; the reviewer runs its own two-phase fetch (`depth='working'`, then `depth='review'`). Dispatch payloads stay minimal (see *Dispatch hygiene*). +``` +Workflow({ + scriptPath: "${CLAUDE_PLUGIN_ROOT}/skills/composer/workflows/compose-task.js", + args: { taskRef, taskId, projectId, categories, tagVocabulary, + pickEstimate, pickPriority, workType, tags, thinDescription, + mode, plannableOnly, resumeFrom, priorBrief, gateAnswers, + fixFindings, prUrl, priorFailure, estimate, flags }, +}) +``` -## Status vocabulary +If `${CLAUDE_PLUGIN_ROOT}` does not resolve in the tool argument, substitute the absolute path of this plugin's root. The workflow runs in the background; the orchestrator is suspended until it returns, so it spends no context tokens while phases run. -Every subagent return ends with `STATUS: `. 
Branch on the status, not on your reading of the prose: +The workflow dispatches the four phase agents by `agentType`, each with explicit `model`/`effort`/`schema`, the implementer with `isolation:'worktree'`. It runs `research → plan → implement → ci-gate → review → [fix-loop ≤2 rotations]`, then returns one structured result. It does **not** merge, propagate, or touch edges; those are the orchestrator's seams. The phase contracts live in the agent files; do not duplicate them here. -| STATUS | Meaning | Orchestrator reaction | -| --- | --- | --- | -| `DONE` | Phase output complete | Advance to the next phase | -| `DONE_WITH_CONCERNS` | Complete, but the agent flagged doubts | Quote the concerns in the run log, then advance | -| `NEEDS_DECISION` | A user decision is required | Gate via `AskUserQuestion`; act on the answer | -| `BLOCKED` | Phase cannot complete | *Failure handling* | +| Phase | `agentType` | Writes to Piyaz | Workflow captures | +| --- | --- | --- | --- | +| 1. Research | `piyaz:composer-researcher` | refinement fields only (`description`, `acceptanceCriteria`, `tags`, `category`, `priority`, `estimate`, `decisions`); never `status` | brief, status, flags, confidence, refined estimate/work-type, proposed rewrites | +| 2. Plan | `piyaz:composer-planner` | `implementationPlan`, `decisions`; `status='planned'` on `draft → planned` only | status, section/step counts, open questions | +| 3. Implement | `piyaz:composer-implementer` | `status='in_progress'` (claim), `status='in_review'` (+ Completion Protocol); fix mode rotates `in_review → in_progress → in_review` | status, PR URL, AC counts, concerns | +| CI gate | generic (haiku) | nothing | `green` / `red` / `pending` / `none`, failing checks | +| 4. 
Review | `piyaz:review` (dispatched with a verdict schema) | nothing (read-only) | verdict, blocking findings | -Expected `NEEDS_DECISION` triggers (typically from the researcher; the planner may raise one too — gate the same way and re-dispatch the **raising agent** with the answers; the implementer and reviewer contracts do not return this status): +## The workflow result -- **Oversize** (`oversize-task` flag): offer to dispatch `piyaz:decompose-task` or skip the task. Composer never splits a task itself. -- **Proposed rewrites** (`## Proposed rewrites` non-empty): show original vs proposed per field with the researcher's rationale; offer accept / deny. On accept, apply via `piyaz_task action='update'` and re-dispatch the researcher on the rewritten task (the old brief is invalid). On deny, end the iteration: backlog mode picks the next task; single-task mode stops. -- **Low confidence or external input** (confidence < 0.6, `external-input-required`): surface the open questions, wait for answers, re-dispatch with the answers appended. +The workflow returns exactly one of three shapes. Branch on `result.status`, not on prose: -A return without a STATUS line is malformed: re-read the prose once; if the outcome is still ambiguous, treat it as `BLOCKED`. 
+| `status` | Meaning | Orchestrator reaction | +| --- | --- | --- | +| `DONE` | Task ran to `in_review` (or `planned` for a plannable-only pick) | Surface the verdict, run the *Merge gate*, propagate | +| `NEEDS_DECISION` | The research or plan phase gated; `result.gate` carries the trigger and `result.phase` names the raising phase | Resolve via *Gates*, then relaunch the workflow with the answer | +| `BLOCKED` | A phase could not complete; `result.phase` and `result.reason` say which and why | *Failure handling* | -**Headless gate fallback:** when `AskUserQuestion` is unavailable (errors or hangs — headless runs, policy-denied contexts), a `NEEDS_DECISION` gate resolves to skip-the-task: append a `GATE` line to the run log carrying the unasked question and the skip as continuations, then end the iteration (`TASK_END outcome=skipped`) (backlog mode picks the next task; single-task mode stops). Never fabricate an answer — skipping is the reversible default (resilience §11). +A `DONE` result also carries: `outcome` (`in_review`|`planned`), `verdict`, `prUrl`, `ciState`, `acSatisfied`/`acTotal`, `rotations`, `escalated` (true when a `block` verdict or an exhausted fix budget left findings unaddressed), `blockingFindings`, `concerns`. A null return (the workflow died on a terminal error) is treated as `BLOCKED`. ## Session bootstrap Once per session, before the first iteration: 1. **Resolve the project.** `piyaz_project action='list'` → `action='select' projectId='...'`. Single-task mode: also `piyaz_query type='search' query=''` to resolve the task UUID and current status. -2. **Read meta.** `piyaz_query type='meta'`. Keep the categories and tag vocabulary for researcher dispatches; drop the status counts. -3. **Stale-claim sweep.** Scan the project's task list (`piyaz_query type='list'`) for tasks already at `in_progress`. 
These are possible stale claims from dead sessions; surface them in the first pick rationale so the user sees them before the run commits elsewhere. -4. **Init the run log.** `mkdir -p .piyaz` and guard the gitignore (`grep -qxF '.piyaz/' .gitignore 2>/dev/null || printf '\n.piyaz/\n' >> .gitignore` — the resilience §3 pattern, with a leading newline so a `.gitignore` ending without one is not corrupted). If `.piyaz/composer-.md` already exists and ends with a `RUN_END` line, archive it to `.piyaz/archive/composer--.md` and start fresh; if it exists *without* a `RUN_END`, that is a resume signal — see *Recovering after compaction* before doing anything else. Exception: when the unfinished log's `RUN_START mode=` differs from this invocation (e.g. `rework` invoked over an interrupted backlog run), it is not a resume — append `RUN_END reason=superseded-by-`, archive it, and start fresh. Then append `RUN_START`. +2. **Read meta.** `piyaz_query type='meta'`. Keep the categories and tag vocabulary for the workflow's research args; drop the status counts. +3. **Stale-claim sweep.** Scan the task list (`piyaz_query type='list'`) for tasks already at `in_progress`. Surface possible stale claims from dead sessions in the first pick rationale. +4. **Set the merge policy.** Ask once with the `AskUserQuestion` tool: `never` (default; HOTL owns the merge), `ask-each` (confirm per PR), or `auto-on-approve` (merge automatically on an `approve` verdict with green CI). Record the choice; it holds for the whole run. When `AskUserQuestion` is unavailable (headless), default to `never`. +5. **Init the run log.** `mkdir -p .piyaz` and guard the gitignore (`grep -qxF '.piyaz/' .gitignore 2>/dev/null || printf '\n.piyaz/\n' >> .gitignore`). If `.piyaz/composer-.md` exists and ends with `RUN_END`, archive it to `.piyaz/archive/composer--.md` and start fresh; if it exists *without* a `RUN_END`, that is a resume signal — see *Recovering after compaction* first.
When the unfinished log's `RUN_START mode=` differs from this invocation, append `RUN_END reason=superseded-by-`, archive, and start fresh. Then append `RUN_START mode=<...> mergePolicy=<...> project=`. -Then start iterating. There is nothing to install and nothing to confirm. +Then start iterating. There is nothing to install and nothing to confirm beyond the merge policy. ## The loop -At the start of each iteration, materialize these steps as todos and mark them off as you go (the todo list is your compaction anchor): pick, research, plan, implement, ci gate, review, surface verdict, propagate. +At the start of each iteration, materialize these todos and mark them off (the todo list is your compaction anchor): pick, launch workflow, handle result, surface verdict, merge gate, propagate. ```dot digraph composer_iteration { "Pick next task" [shape=box]; "Ready or plannable task?" [shape=diamond]; "STOP: backlog drained" [shape=doublecircle]; - "Dispatch researcher" [shape=box]; - "Researcher STATUS?" [shape=diamond]; - "Gate with user" [shape=box]; + "Launch compose-task workflow" [shape=box]; + "Result status?" [shape=diamond]; + "Resolve gate with user" [shape=box]; "Continue this task?" [shape=diamond]; "STOP: iteration ends (single-task)" [shape=doublecircle]; - "Dispatch planner" [shape=box]; - "Planner STATUS?" [shape=diamond]; - "Pick was plannable-only?" [shape=diamond]; - "Dispatch implementer" [shape=box]; - "Implementer STATUS?" [shape=diamond]; - "Dispatch reviewer" [shape=box]; - "Reviewer STATUS?" [shape=diamond]; - "Verdict?" [shape=diamond]; - "Fix rotations used < 2?" [shape=diamond]; - "Dispatch implementer in fix mode" [shape=box]; - "CI gate: gh pr checks (10m bound)" [shape=box]; - "Escalate all verdicts to HOTL" [shape=box]; - "Surface verdict + propagate" [shape=box]; "Failure handling" [shape=box]; + "outcome = planned?" 
[shape=diamond]; + "Surface verdict" [shape=box]; + "Merge gate (per policy)" [shape=box]; + "Propagate" [shape=box]; "Single-task mode?" [shape=diamond]; "STOP: iteration complete" [shape=doublecircle]; "Pick next task" -> "Ready or plannable task?"; "Ready or plannable task?" -> "STOP: backlog drained" [label="no"]; - "Ready or plannable task?" -> "Dispatch researcher" [label="yes"]; - "Dispatch researcher" -> "Researcher STATUS?"; - "Researcher STATUS?" -> "Dispatch planner" [label="DONE / DONE_WITH_CONCERNS"]; - "Researcher STATUS?" -> "Gate with user" [label="NEEDS_DECISION"]; - "Researcher STATUS?" -> "Failure handling" [label="BLOCKED"]; - "Gate with user" -> "Continue this task?"; - "Continue this task?" -> "Dispatch researcher" [label="yes: re-dispatch with answers"]; + "Ready or plannable task?" -> "Launch compose-task workflow" [label="yes"]; + "Launch compose-task workflow" -> "Result status?"; + "Result status?" -> "outcome = planned?" [label="DONE"]; + "Result status?" -> "Resolve gate with user" [label="NEEDS_DECISION"]; + "Result status?" -> "Failure handling" [label="BLOCKED / null"]; + "Resolve gate with user" -> "Continue this task?"; + "Continue this task?" -> "Launch compose-task workflow" [label="yes: relaunch with answers"]; "Continue this task?" -> "Pick next task" [label="no (backlog)"]; "Continue this task?" -> "STOP: iteration ends (single-task)" [label="no (single-task)"]; - "Dispatch planner" -> "Planner STATUS?"; - "Planner STATUS?" -> "Pick was plannable-only?" [label="DONE / DONE_WITH_CONCERNS"]; - "Pick was plannable-only?" -> "Dispatch implementer" [label="no"]; - "Pick was plannable-only?" -> "Single-task mode?" [label="yes: planned; deps unfinished"]; - "Planner STATUS?" -> "Gate with user" [label="NEEDS_DECISION"]; - "Planner STATUS?" -> "Failure handling" [label="BLOCKED"]; - "Dispatch implementer" -> "Implementer STATUS?"; - "Implementer STATUS?" 
-> "CI gate: gh pr checks (10m bound)" [label="DONE / DONE_WITH_CONCERNS"]; - "CI gate: gh pr checks (10m bound)" -> "Dispatch reviewer" [label="green / red / unresolved: annotate dispatch"]; - "Implementer STATUS?" -> "Failure handling" [label="BLOCKED"]; - "Dispatch reviewer" -> "Reviewer STATUS?"; - "Reviewer STATUS?" -> "Verdict?" [label="DONE"]; - "Reviewer STATUS?" -> "Failure handling" [label="BLOCKED"]; - "Verdict?" -> "Surface verdict + propagate" [label="approve"]; - "Verdict?" -> "Fix rotations used < 2?" [label="request-changes"]; - "Verdict?" -> "Escalate all verdicts to HOTL" [label="block"]; - "Fix rotations used < 2?" -> "Dispatch implementer in fix mode" [label="yes"]; - "Fix rotations used < 2?" -> "Escalate all verdicts to HOTL" [label="no"]; - "Dispatch implementer in fix mode" -> "Implementer STATUS?"; - "Escalate all verdicts to HOTL" -> "Surface verdict + propagate"; - "Surface verdict + propagate" -> "Single-task mode?"; + "outcome = planned?" -> "Single-task mode?" [label="yes (plannable-only)"]; + "outcome = planned?" -> "Surface verdict" [label="no"]; + "Surface verdict" -> "Merge gate (per policy)"; + "Merge gate (per policy)" -> "Propagate"; + "Propagate" -> "Single-task mode?"; "Single-task mode?" -> "STOP: iteration complete" [label="yes"]; "Single-task mode?" -> "Pick next task" [label="no"]; + "Failure handling" -> "Single-task mode?"; } ``` ### Step details -1. **Pick.** Backlog: `piyaz_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. Fall back to the highest-priority `ready` task when the intersection is empty, then to `piyaz_analyze type='plannable'` when `ready` is empty (those route through research + plan only; their dependencies are unfinished, so there is nothing to implement yet — note the pick as **plannable-only**). Single-task: the named task; if already `done` or `cancelled`, report that and stop. 
If the named task is already claimed, never re-run research or planning on it: at `in_progress`, jump straight to implement-phase recovery (the partial-success check in *Failure handling*); at `in_review`, jump straight to *Review and the fix loop*. Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval — the user interrupts if they disagree. +1. **Pick.** Backlog: `piyaz_analyze type='ready'` ∩ `type='critical_path'`; rank by priority (`urgent > core > normal > backlog`), tie-break by lowest estimate. Fall back to the highest-priority `ready` task when the intersection is empty, then to `piyaz_analyze type='plannable'` when `ready` is empty (plannable picks route through research + plan only; mark the pick **plannable-only**). Single-task: the named task; if `done` or `cancelled`, report and stop; if already claimed, see *Failure handling* (jump to the in-flight phase, never restart). Emit a one-paragraph pick rationale (taskRef, priority, estimate, critical-path yes/no, one-sentence reason). Do not wait for approval; the user interrupts if they disagree. + +2. **Gather pick facts and launch.** Build the workflow `args` from the pick and bootstrap: `taskRef`, `taskId` (the UUID; never the ref in tool calls — conventions §4), `projectId`, `categories`, `tagVocabulary`, `pickEstimate`, `pickPriority`, `workType` and `tags` (from the task row), `thinDescription` (true when the description fails the artifacts §1 rubric on a glance), `mode`, `plannableOnly`. Write `PICK` then `WORKFLOW task= runId=` to the run log, then launch the workflow and await the result. + +3. **Handle the result.** `NEEDS_DECISION` → *Gates*. `BLOCKED`/null → *Failure handling*. `DONE` with `outcome=planned` (plannable-only) → end the iteration (`TASK_END outcome=planned`); backlog returns to the pick, single-task reports and stops. `DONE` with `outcome=in_review` → step 4. + +4. 
**Surface + merge + propagate.** Quote the final verdict block verbatim (`VERDICT` to the run log). Run the *Merge gate*. Then propagate per lifecycle §3: `piyaz_query type='edges' taskId=''`, `piyaz_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3). Propagation depth: full when the PR was merged or the verdict was `approve`; otherwise provisional, each note prefixed `Provisional pending HOTL on PR #:`. Surface newly-unblocked tasks in the next pick rationale. Write `PROPAGATED`, then `TASK_END outcome=in_review rotations=`. -2. **Research.** Dispatch `piyaz:composer-researcher` with: `Target task: `, the categories + tag vocabulary from bootstrap, and (on re-dispatch) the user's gate answers. Status does not change in this phase; the researcher refines the task row in place. React per *Status vocabulary*. +5. **Loop.** Single-task: report the outcome and stop. Backlog: next iteration, no pause. -3. **Plan.** Dispatch `piyaz:composer-planner` with: `Target task: `, the task's current status (so it knows new-plan vs re-validate), and the research brief verbatim. Verify with one `piyaz_context depth='summary' taskId=''` poll: a `draft` entry must now show a plan and `status='planned'`. If not, re-dispatch once with the failure appended; a second miss is `BLOCKED`. +## Gates - When the pick was plannable-only, the iteration ends here: the task is now `planned` and its dependencies are still unfinished, so there is nothing to implement. Backlog mode returns to the pick; single-task mode reports the planned outcome and stops. Never dispatch the implementer on a plannable-only pick. +A `NEEDS_DECISION` result means the research or plan phase needs a user decision before the task can proceed. `result.phase` names the raising phase and `result.gate` carries the trigger. Resolve with the `AskUserQuestion` tool, then relaunch the workflow: -4.
**Implement.** First check the pick type: when the pick was plannable-only, do not enter this step — the iteration already ended at `planned` (step 3). Otherwise dispatch `piyaz:composer-implementer` with: `Target task: . Plan is saved to Piyaz; fetch via piyaz_context depth='agent'. Claim the task (planned → in_progress), implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol.` Append the prior failure summary on retries. The implementer runs worktree-isolated (frontmatter `isolation: worktree`; also pass the Task tool's `isolation: "worktree"` parameter at dispatch, which is verified to work with plugin agents): it works in its own tree, the orchestrator's tree never moves, and the researcher's baseline stays stable. +- **Oversize** (`oversize-task` flag): offer to dispatch `piyaz:decompose-task` or skip the task. Composer never splits a task itself. On decompose, dispatch the decompose agent and end the iteration; the children land in the backlog. +- **Proposed rewrites** (`result.gate.proposedRewrites` non-empty): show original vs proposed per field with the rationale; offer accept / deny. On accept, apply via `piyaz_task action='update'` and relaunch the workflow **fresh** (no `resumeFrom`) so research re-grounds on the rewritten task. On deny, end the iteration (backlog picks next; single-task stops). +- **Low confidence or external input** (confidence < 0.6, `external-input-required`, or any plan-phase open question): surface the open questions, wait for answers, then relaunch — a research gate relaunches **fresh** with `gateAnswers`; a plan gate relaunches with `resumeFrom='plan'`, `priorBrief=result.brief`, and `gateAnswers`, so research is not redone. -5.
`rc=124` (timeout killed the watch mid-pending) or `rc=8` (gh's checks-pending code) → still pending. Any other non-zero `rc` → red — unless the output says no checks are reported (gh has no distinct exit code for a repo with no checks configured; it shares the red codes). No checks reported → skip the gate entirely; that is a skip, not a red. For a real red, read the failing check names from the output. Branch on the result: - - **Green**: dispatch the reviewer normally. - - **Red**: dispatch the reviewer with the failing check names appended to the dispatch (`CI: failing — `); the reviewer may not approve red CI. - - **Still pending at the 10-minute timeout**: dispatch the reviewer with `CI: unresolved after 10m`; `approve` is off the table, and an otherwise-clean review returns `request-changes` citing unresolved CI as the sole blocking finding. +**Headless gate fallback:** when `AskUserQuestion` is unavailable (errors or hangs), a `NEEDS_DECISION` resolves to skip-the-task: append a `GATE` line carrying the unasked question and the skip, write `TASK_END outcome=skipped`, end the iteration (backlog picks next; single-task stops). Never fabricate an answer; skipping is the reversible default (resilience §11). - The gate re-runs after every fix rotation's implementer DONE. +## Merge gate -6. **Review and the fix loop.** Dispatch `piyaz:review` with: `Target task: . PR URL: . Mode: composer-phase-4.` — nothing more; the reviewer's two-phase context fetch is its own contract, and instructing a `depth='review'` fetch up front would defeat it. On `STATUS: DONE`, branch on the verdict payload: - - **`approve`**: go to step 7. - - **`request-changes`**, fewer than 2 fix rotations used this task: dispatch the implementer in fix mode — `Target task: . Fix mode. PR: . Address exactly these review findings, re-run verification, re-mark in_review:` followed by the verdict's blocking findings verbatim. 
On the implementer's `DONE`, re-run the CI gate (step 5), then re-dispatch the reviewer (same dispatch shape). Each fix dispatch + re-review is one rotation. - - **`request-changes`** with 2 rotations used, or **`block`**: stop fixing. Escalate every verdict from this task to HOTL and go to step 7. `block` is never auto-fixed; review.md calibrates it as "one rotation will not land this". - - The verdict is advisory beyond the fix loop: HOTL owns `in_review → done` on GitHub regardless of verdict. +The merge gate runs after a `DONE` result with `outcome=in_review`, governed by the run's merge policy. It fires **only** when `result.verdict === 'approve'` AND `result.ciState === 'green'`; a `request-changes`, `block`, `escalated`, red, or pending result is never merged. -7. **Surface + propagate.** Quote the final verdict block verbatim. Then propagate per lifecycle §3: `piyaz_query type='edges' taskId=''`, `piyaz_analyze type='downstream' taskId=''`; update or retire edge notes the work invalidated (edge-note shape: artifacts §3 — one to three short sentences addressed to the downstream task's agent). Propagation depth follows the verdict: on `approve`, propagate fully. On an escalated `request-changes` or `block`, write edge-note updates as provisional — prefix each with `Provisional pending HOTL on PR #:` — because HOTL may reject the work; the HOTL `done` flip (outside composer, as today) is the trigger for firming them up. Surface newly-unblocked tasks in the next pick rationale. +- **`never`** (default): do not merge. HOTL owns the merge and the `in_review → done` transition, exactly as without this feature. Propagate provisionally unless the verdict was `approve`. +- **`ask-each`**: ask via the `AskUserQuestion` tool whether to merge this PR. On yes, merge as below. On no (or headless), leave it for HOTL. +- **`auto-on-approve`**: merge without asking. 
+ +To merge: `gh pr merge --squash --delete-branch` (squash is the default; follow the repo's configured default method when it differs). On a clean merge, write the task `done` — this is the **one** case the orchestrator writes a status transition, authorized by the run-start merge policy: + +``` +piyaz_task action='update' taskId='' status='done' + executionRecord=' via composer auto-merge after approve + green CI>' +``` -8. **Loop.** Single-task: report the iteration outcome and stop. Backlog: next iteration, no pause. +Then propagate fully (the work landed) and write `MERGE task= pr= method=squash` to the run log. A failed merge (conflict, protected branch, merge-queue required) is not a task failure: report it, leave the task at `in_review` for HOTL, and continue. -### Model selection +## Model selection -Every phase dispatch passes an explicit `model:` parameter on the Task tool call; dispatch-time models override agent frontmatter. The frontmatter models stay unchanged — they are the conservative defaults for direct (non-composer) invocation. +The workflow self-selects each phase's model and effort from the pick facts and the research stage's refined estimate/work-type/flags. The orchestrator does not pass models; it passes the pick facts. The table the workflow applies: | Phase | est 1–2 | est 3 | est 5 | est 8–13 / unset | | --- | --- | --- | --- | --- | -| Researcher | sonnet | sonnet | sonnet | sonnet | -| Planner | sonnet | sonnet if work-type ∈ {docs, test, chore}, else opus | opus | opus | -| Implementer | sonnet | sonnet if work-type ∈ {docs, test, chore}, else opus | opus | opus | -| Reviewer | opus | opus | opus | opus — never downgrade the reviewer | - -Use the **post-research estimate**, not the pick-time one: the researcher's *Applied refinements* reports estimate changes for the planner dispatch, and the step-3 plan-verification poll (`piyaz_context depth='summary'`) re-surfaces the current value for the implement and review dispatches. 
Work-type comes from the task's work-type tag (pick payload or the brief's tag refinements); when the work-type is unknown, treat it as non-docs. +| Researcher | sonnet (haiku only if est 1 and docs/chore) | sonnet | opus | opus | +| Planner | opus | opus | opus | opus | +| Implementer | sonnet (also docs/test/chore) | sonnet if docs/test/chore, else opus | opus | opus | +| CI gate | haiku | haiku | haiku | haiku | +| Reviewer | opus | opus | opus | opus — never downgrade | -Guardrails — force opus for the planner and implementer regardless of estimate when any of these holds: +Research correctness is load-bearing: a mis-refined task wastes far more downstream opus tokens than a cheaper research model saves, so haiku research is reserved for trivial, unambiguous work only, and the floor rises to opus on substantial or risky tasks. -- the task carries a `security`, `safety`, or `compliance` tag; -- the estimate is 8, 13, or missing; -- the dispatch is a fix-mode rotation; -- the dispatch is any retry after a failure, or partial-success recovery; -- the researcher returned `DONE_WITH_CONCERNS` with `security-boundary-uncovered`, `version-drift-major`, or `dep-mismatch` (the risk-bearing flags; `missing-citation` and `ambiguous-criterion-unresolved` are quality notes and do not bump the model); -- `priority='urgent'`. +Guardrails force opus and higher effort on the planner and implementer regardless of estimate when any holds: a `security`/`safety`/`compliance` tag; estimate 8, 13, or missing; a fix-mode rotation; any retry or partial-success recovery; `priority='urgent'`; or a risk-bearing research flag (`security-boundary-uncovered`, `version-drift-major`, `dep-mismatch`). These are encoded in `compose-task.js`; this table is the human-readable mirror. ## Run log -The run log is composer's crash-safe memory: a pure append-only event log at `.piyaz/composer-.md`, one active file per project. The conversation can compact; the log does not. 
Counters are never tracked as state — they derive by grep over events **after the latest `RUN_START` line**, so earlier runs' events never leak into this run's budgets: rotations used on task X = count of `FIX task=X` lines; failed attempts = count of `FAIL task=X` lines. +The run log is composer's crash-safe memory: an append-only event log at `.piyaz/composer-.md`, one active file per project. The conversation can compact; the log does not. Counters derive by grep over events **after the latest `RUN_START`**: this run's iterations = `PICK` lines; failed attempts on task X = `FAIL task=X` lines. -One timestamped line per event, `key=value` pairs; multi-line payloads (blocking findings verbatim, gate questions and answers, failure summaries, DONE_WITH_CONCERNS text) follow as `> ` continuation lines. The event vocabulary: +One timestamped line per event, `key=value` pairs; multi-line payloads (blocking findings, gate questions and answers, failure summaries) follow as `> ` continuation lines. 
The vocabulary: | Event | Written when | | --- | --- | -| `RUN_START` | bootstrap completes (`mode=backlog\|single\|rework project=`) | +| `RUN_START` | bootstrap completes (`mode=backlog\|single\|rework mergePolicy=<...> project=`) | | `PICK` | step 1 emits the pick rationale | -| `PHASE` | a phase subagent returns (`phase=research\|plan\|implement status=`) | -| `GATE` | a `NEEDS_DECISION` gate resolves — user answer or headless skip; question and answer as continuations | -| `VERDICT` | the reviewer returns (`verdict= rotation=/2`; blocking findings as continuations) | -| `FIX` | **before** dispatching a fix rotation (`rotation=/2 pr=`) | -| `ESCALATE` | rotations exhausted or a `block` verdict goes to HOTL | -| `SURFACED` | the final verdict is quoted to the user | -| `PROPAGATED` | step 7 propagation completes (`edges= unblocked=`) | -| `BRIEF` | a `--pipelined` prefetch brief lands (`task= baselinedAt=`; the brief verbatim as continuations) | -| `FAIL` | a phase returns BLOCKED (failure summary as continuation) | +| `WORKFLOW` | immediately after launching the workflow (`task= runId=`) | +| `GATE` | a `NEEDS_DECISION` resolves — user answer or headless skip; question and answer as continuations | +| `VERDICT` | the workflow returns DONE (`verdict= rotations= ci= escalated=`; blocking findings as continuations) | +| `MERGE` | the merge gate merges a PR (`task= pr= method=squash`) | +| `ESCALATE` | a `block` or rotations-exhausted result goes to HOTL | +| `PROPAGATED` | propagation completes (`edges= unblocked=`) | +| `BRIEF` | a `--pipelined` prefetch brief lands (`task= baselinedAt=`; brief verbatim as continuations) | +| `FAIL` | the workflow returns BLOCKED (failure summary as continuation) | | `TASK_END` | the iteration ends (`outcome=in_review\|planned\|stuck\|skipped rotations=`) | | `RESUME` | recovery appends this after reading the log | -| `RUN_END` | any stop condition (`reason=<...> picked= shipped= stuck= skipped=`) | +| `RUN_END` | any stop 
condition (`reason=<...> picked= shipped= merged= stuck= skipped=`) | -The `FIX` line is written *before* the rotation dispatch — increment-before-dispatch is crash-safe: a crash mid-rotation wastes at most one rotation and never exceeds the budget. Format example: - -``` -2026-06-12T14:01:09Z RUN_START mode=backlog project=RZE -2026-06-12T14:01:31Z PICK task=RZE-42 prio=core est=5 critical=yes — auth middleware; unblocks RZE-44,RZE-45 -2026-06-12T14:05:44Z PHASE task=RZE-42 phase=plan status=DONE verified=planned -2026-06-12T14:31:02Z PHASE task=RZE-42 phase=implement status=DONE pr= -2026-06-12T14:39:18Z VERDICT task=RZE-42 verdict=request-changes rotation=0/2 -> blocking: src/auth/refresh.ts:88 catch swallows token-expiry; AC3 unmet -2026-06-12T14:39:20Z FIX task=RZE-42 rotation=1/2 pr= -2026-06-12T14:58:30Z VERDICT task=RZE-42 verdict=approve rotation=1/2 -2026-06-12T14:58:55Z SURFACED task=RZE-42 verdict=approve -2026-06-12T14:59:40Z PROPAGATED task=RZE-42 edges=2 unblocked=RZE-44,RZE-45 -2026-06-12T14:59:41Z TASK_END task=RZE-42 outcome=in_review rotations=1 -2026-06-12T16:40:12Z RUN_END reason=backlog-drained picked=3 shipped=1 stuck=1 skipped=1 -``` - -If `.piyaz/` is not writable (sandboxed runs), fall back to whatever directory is writable and name the chosen path in your first report; if no local write is possible at all, run without the log and say so — the run loses crash recovery, not correctness. +Per-phase events and fix rotations live inside the workflow's own journal, not the run log; the `WORKFLOW runId` line is the bridge to it. If `.piyaz/` is not writable, fall back to any writable directory and name the chosen path in the first report; if no local write is possible, run without the log and say so — the run loses crash recovery, not correctness. ## Rework mode -Pull-based: the backend has no webhooks, and `task_links` is the only PR record. 
The user invokes rework when GitHub review feedback exists; composer fetches it, re-anchors it, and runs the existing fix loop on it. +Pull-based: the backend has no webhooks, and `task_links` is the only PR record. The user invokes rework when GitHub review feedback exists; composer fetches it, re-anchors it, and runs the fix loop on it. -1. **Resolve the pair.** Given a taskRef, read `task.links` filtered to `kind='pull_request'`; given a PR URL, resolve the task from the `[]` bracket in the PR title/body (verify the link row agrees). When several PR links exist, prefer the newest open PR — never trust oldest-link-wins. Every downstream dispatch carries the explicit PR URL. -2. **Reviewer-led intake.** Dispatch `piyaz:review` with: `Target task: . PR URL: . Mode: rework-intake.` The intake re-verifies the human feedback against current HEAD and returns a standard verdict. +1. **Resolve the pair.** Given a taskRef, read `task.links` filtered to `kind='pull_request'`; given a PR URL, resolve the task from the `[]` bracket (verify the link row agrees). Prefer the newest open PR when several exist. +2. **Reviewer-led intake.** Dispatch `piyaz:review` with `Target task: . PR URL: . Mode: rework-intake.` The intake re-verifies the human feedback against current HEAD and returns a verdict. 3. **Branch on the intake verdict.** - - `request-changes`: the blocking findings are the human's items with fresh file:line citations. Run *Review and the fix loop* from the fix-dispatch step, with two changes: prefix each fix dispatch with `Rework.` (the implementer accepts an `in_progress` entry only when the dispatch says rework — HOTL may flip `in_review → in_progress` to signal rework), and use a **fresh rotation budget of 2 for this rework invocation** (it is a new review cycle; prior runs' rotations do not count). The CI gate (step 5) applies to each rotation as usual. - - approve-shaped "nothing to rework": zero unresolved feedback. 
Report it and stop; the iteration is complete. - - `BLOCKED` (PR merged/closed, task `done`/`cancelled`): report and stop; there is nothing legal to do. -4. **Finish like any iteration.** Surface the final verdict, propagate (step 7), `TASK_END`. The run log records the whole run with `RUN_START mode=rework`. - -Future (documented, not built): a GitHub webhook feeding `task_links.metadata` and a UI "rework available" signal; this agent-side mode stays the consumer. + - `request-changes`: launch the workflow with `resumeFrom='fix'`, `prUrl=`, and `fixFindings=`. The fix loop uses a **fresh rotation budget of 2** for this rework invocation (the workflow's rotation counter starts at zero per launch). Prefix the implementer dispatch context with rework so the implementer accepts an `in_progress` entry (HOTL may flip `in_review → in_progress` to signal rework). + - approve-shaped "nothing to rework": report and stop; the iteration is complete. + - `BLOCKED` (PR merged/closed, task `done`/`cancelled`): report and stop. +4. **Finish like any iteration.** Surface the verdict, run the merge gate, propagate, `TASK_END`. The run log records `RUN_START mode=rework`. ## Pipelined research-ahead (flag-gated) -Only under `--pipelined`, only in backlog mode, lookahead 1. The win is latency (~15–25%), not tokens; when in doubt, run without the flag. +Only under `--pipelined`, only in backlog mode, lookahead 1. The win is latency (~15–25%), not tokens; when in doubt, run without it. -- **Trigger:** dispatch researcher(B) in the background only after implementer(A) returns DONE — overlap covers A's CI gate, review, and fix rotations only. Never dispatch the prefetch while A's initial implement phase is still running. -- **Pick B excluding A.** B must be ready independently of A by construction — `in_review` unblocks nothing, so the ready set already excludes A's dependents. 
-- **Isolation:** researcher(B) is dispatched with worktree isolation and `run_in_background`; the orchestrator's tree and A's review baseline never move. -- **Brief custody:** when researcher(B) returns, append a `BRIEF` event to the run log (`task= baselinedAt=`) with the brief verbatim as `> ` continuation lines. The transcript copy is working memory; the log copy survives compaction. The prefetch is not a `PICK`: B's `PICK` line is written when B's own iteration starts, so recovery's last-`PICK`-without-`TASK_END` rule still finds A. -- **Gates queue.** A `NEEDS_DECISION` from researcher(B) queues until A's iteration boundary; never interrupt A's review/fix cycle to gate on B. -- **Propagation(A) never runs while researcher(B) is in flight.** Wait for the researcher's return (or stop it) before touching edges. -- **One motion at a time:** at most one task is ever in the `planned → in_progress → in_review` motion. B is never planned, claimed, or implemented early. -- **A prefetch failure consumes no budget.** Researcher(B) BLOCKED or crashed: drop the prefetch silently and research B normally on its own iteration. +- **Trigger:** after task A's workflow returns DONE, prefetch research for the next ready task B in the background by dispatching `piyaz:composer-researcher` directly with worktree isolation and `run_in_background` (the workflow has no research-only early return, so `resumeFrom='research'` cannot be used for the prefetch). Never prefetch while A's workflow is still running. +- **Pick B excluding A.** B must be ready independently of A; `in_review` unblocks nothing, so the ready set already excludes A's dependents. +- **Brief custody:** when the prefetch returns, append a `BRIEF` event with the brief verbatim. The prefetch is not a `PICK`; B's `PICK` lands when B's iteration starts, so recovery's last-`PICK`-without-`TASK_END` rule still finds A. 
Pass the brief into B's workflow launch as `priorBrief` with `resumeFrom='plan'` only when the invalidation table below clears it. +- **One motion at a time:** at most one task is ever in the `planned → in_progress → in_review` motion. B is never planned, claimed, or implemented early. A prefetch failure consumes no budget; drop it and research B normally. -Red flags, in addition to the table above: never plan or claim B early; never run two researchers; never author or amend a brief yourself; never prefetch in single-task or rework mode; never gate mid-A for a prefetch decision. +**Brief invalidation.** After propagation(A), evaluate in order; the first matching row wins: -**Brief invalidation.** After propagation(A) completes, evaluate this table against the prefetched brief, in order; the first matching row wins: - -| # | Signal observed after propagation(A) | Action | +| # | Signal after propagation(A) | Action | | --- | --- | --- | -| 1 | Propagation created a `depends_on` edge B→(non-done task) | Re-pick; the brief is marked stale | -| 2 | B's description was updated by propagation | Re-research (same precedent as the accepted-rewrite rule) | -| 3 | Edge notes into B were updated naming files or patterns in the brief's *Files to touch* | Re-research; otherwise proceed | -| 4 | A's files ∩ B brief's *Files to touch* ≠ ∅ | Re-research with the A PR pointer in the open-questions dispatch slot | -| 5 | A pick re-run returns task C outranking B on priority class | Re-pick to C; a mere tie proceeds with B | -| 6 | Pure `relates_to`/informational note updates, no description change, no overlap | Proceed | -| 7 | None of the above | Proceed (the expected common case) | +| 1 | A `depends_on` edge B→(non-done task) was created | Re-pick; brief is stale | +| 2 | B's description was updated | Re-research (relaunch fresh) | +| 3 | Edge notes into B name files/patterns in the brief's *Files to touch* | Re-research | +| 4 | A's files ∩ B brief's *Files to touch* ≠ ∅ | 
Re-research with the A PR pointer in `gateAnswers` | +| 5 | A re-pick returns C outranking B on priority class | Re-pick to C; a tie proceeds with B | +| 6 | Pure informational note updates, no overlap | Proceed with the brief | +| 7 | None of the above | Proceed | -Re-research reuses the existing open-questions dispatch slot (the same slot gate answers travel in); an invalidation is not a failed attempt and consumes no budget. **Kill switch:** after two consecutive invalidations, disable prefetch for the rest of the run and say so in the next pick rationale — the project is too churny for lookahead today. +**Kill switch:** after two consecutive invalidations, disable prefetch for the rest of the run and say so. ## Dispatch hygiene -Subagents inherit nothing from this session; the dispatch prompt is their whole world beyond their own agent file and tools. Keep every dispatch to the phase minimum shown in *Step details*. Never paste orchestrator transcript, prior-iteration summaries, full meta payloads, or piyaz reference text into a dispatch — the agents load their own rules extract and fetch task context from Piyaz themselves. Oversized dispatches make agents worse, not better. +The workflow builds every phase dispatch from the `args` you pass; the agents inherit nothing else. Keep `args` to the pick facts in *Step details* — never pass orchestrator transcript, prior-iteration summaries, full meta payloads, or piyaz reference text. The agents load their own rule extracts and fetch task context from Piyaz themselves. Oversized dispatches make agents worse, not better. ## Failure handling -`BLOCKED` from any phase is a failed attempt, with one exception: a phase that reports BLOCKED because the task is already at `done` or `cancelled` is not a failure — HOTL resolved the task underneath the run (e.g. approving mid-fix-rotation). Treat that as iteration complete: run *Surface + propagate* if it has not run, consume no failure budget, and move on. 
A second exception: `STATUS: BLOCKED — environmental: ` (gh auth expiry, rate limits, network) is an environment problem, not a work problem — surface it to the user verbatim and consume no failure budget; resume the same phase once the user confirms the environment is fixed. For every other BLOCKED: - -1. Keep the failure summary in your transcript. Do not write it to `decisions` — per artifacts §1 that field is CHOICE + WHY, not process metadata. -2. Leave the task at its current status. Never roll back, never cancel. -3. Backlog mode: when the failure summary is transient-shaped (network hiccup, flaky test, dirty workspace state), retry the failed phase once with the failure summary appended; otherwise, or when the retry also fails, write `TASK_END outcome=stuck`, then move to the next pick; the stuck task stays where it is for human triage. Single-task mode: retry the failed phase up to three total attempts on the task, appending each failure summary to the re-dispatch; after the third, report and stop. Re-run research or planning only when the failure clearly traces to a planning gap (e.g. the plan names a file that does not exist). +`BLOCKED`/null from the workflow is a failed attempt, with exceptions: -**Partial success (PR exists, `in_review` not marked):** when a retry's pre-flight finds the task at `in_progress` with an open PR matching `/-`, do not re-implement. First verify the PR actually belongs to the task: its title or body must carry the `[]` bracket form — a branch-name match alone is not proof. Verified: dispatch the implementer to resume the Completion Protocol against the existing PR (re-evaluate ACs, populate the payload, mark `in_review`). Counts as one attempt. +- A phase that reports BLOCKED because the task is already `done` or `cancelled` is not a failure — HOTL resolved it underneath the run. Run *Surface + merge + propagate* if it has not run, consume no budget, move on. 
+- `BLOCKED — environmental: ` (gh auth, rate limits, network) is an environment problem; surface it verbatim, consume no budget, resume the same workflow (via `resumeFrom`) once the user confirms the fix. +- `BLOCKED` from the plan phase prefixed `foundation-unsound` means the planner judged the research foundation wrong; relaunch the workflow **fresh** once to re-research, then treat a second failure normally. -**`in_review` without a PR link:** when the task sits at `in_review` but `task.links` carries no `pull_request` entry, look for the orphaned PR: +For every other BLOCKED: -```bash -gh pr list --state open --limit 100 --json url,title,body,headRefName \ - --jq '.[] | select(.headRefName | contains("-"))' -``` +1. Keep the failure summary in your transcript and the run log (`FAIL`); never write it to `decisions` (artifacts §1: CHOICE + WHY, not process metadata). +2. Leave the task at its current status. Never roll back, never cancel. +3. Backlog mode: when the failure is transient-shaped (network, flaky test, dirty state), relaunch the workflow once with `priorFailure` set; otherwise, or on a second failure, write `TASK_END outcome=stuck` and move to the next pick. Single-task mode: relaunch up to three total attempts, appending each failure summary as `priorFailure`; after the third, report and stop. -If a hit carries the `[]` bracket form in title or body, dispatch the implementer to re-run the Completion Protocol payload against it (the `prUrl` write repairs the link). No verified match: report the inconsistency to the user; never fabricate a link. +**Partial success and orphaned PRs** are handled inside the implementer's pre-flight (it resumes the Completion Protocol against an existing branch/PR rather than re-implementing). 
When a single-task pick is already `in_progress` or `in_review`, launch the workflow with `resumeFrom='implement'` (in_progress) or `resumeFrom='fix'` with the existing `prUrl` (in_review); the implementer's pre-flight does the rest. ## Stop conditions -Stop and report in plain language (there are no magic stop phrases) when one of these holds: +Stop and report in plain language (there are no magic stop phrases) when one holds: -1. **Backlog drained**: `ready` and `plannable` are both empty. The stop report enumerates every task left at `in_progress`/`in_review` with its failure summary — the stranded-task report; nothing strands silently. +1. **Backlog drained**: `ready` and `plannable` are both empty. The stop report enumerates every task left at `in_progress`/`in_review` with its failure summary — nothing strands silently. 2. **Failure budget exhausted**: three failed attempts on the same task (single-task mode). 3. **User says stop**: exit after the in-flight write finishes. -4. **Single-task or rework iteration complete**: verdict surfaced and propagation done (rework: feedback addressed, or nothing to rework). The task itself sits at `in_review` awaiting HOTL; composer's job is finished. +4. **Single-task or rework iteration complete**: verdict surfaced, merge gate run, propagation done. 5. **Rewrite denied** (single-task mode): the user rejected a proposed rewrite at the gate. -6. **Piyaz transport/auth failure**: any Piyaz tool call fails with auth expiry, 401/403, a 5xx, or a network error. Stop immediately — these are not retryable in-session (resilience §10) — and report the exact error text plus the last completed phase for each in-flight task. - -These six are exhaustive. Do not invent new stop conditions, and do not stop for anything else. +6. **Piyaz transport/auth failure**: any Piyaz tool call fails with auth expiry, 401/403, a 5xx, or a network error. 
Stop immediately (not retryable in-session, resilience §10) and report the exact error plus the last completed phase per in-flight task. -Every stop appends `RUN_END` with its reason and the grep-derived counters, then offers in the stop report to archive the log to `.piyaz/archive/`; the headless default is archive. +These six are exhaustive. Every stop appends `RUN_END` with its reason and the grep-derived counters, then offers to archive the log; the headless default is archive. ## Recovering after compaction -Read the run log first: `.piyaz/composer-.md`. The last `PICK` without a matching `TASK_END` is the in-flight task. Division of authority: **Piyaz wins on status** — re-read the task row and never trust the log over the server for where the task is; **the log wins on counters and history** — rotations used (`FIX task=X` count), failed attempts (`FAIL task=X` count), verdict history, gate answers, and DONE_WITH_CONCERNS text all come from the log, never from your memory. Rebuild the backlog skip set from this run's `TASK_END outcome=stuck` and `outcome=skipped` lines (the skip set is per-run; archives do not feed it). Append a `RESUME` line, then continue from the derived phase. +Read the run log first: `.piyaz/composer-.md`. The last `PICK`/`WORKFLOW` without a matching `TASK_END` is the in-flight task. **Piyaz wins on status** — re-read the task row and never trust the log over the server. **The log wins on history** — the merge policy (`RUN_START mergePolicy=`), gate answers, verdict history, and the workflow `runId`. 
+ +To resume the in-flight task: -To derive the phase, combine the in-flight task's last log lines with its Piyaz status: `draft` without a plan → research or planning pending; `planned` → implementation pending (or iteration end, when the pick was plannable-only); `in_progress` → implementer in flight, a fix rotation in flight (a trailing `FIX` without a following `VERDICT` means resume that rotation, budget already counted), or partial-success recovery; `in_review` → CI gate or review pending, the fix loop mid-cycle, or the verdict already logged (check `VERDICT` lines before re-dispatching); `done` → HOTL approved, run propagation if no `PROPAGATED` line exists. +- A `WORKFLOW runId=` line with no `VERDICT`/`TASK_END` after it means the workflow may still be journaled. Resume it with `Workflow({ scriptPath, resumeFromRunId: '' })` — completed phases return from cache, only the unfinished phase re-runs. Stop the prior run first if it is somehow still live. +- No usable runId: fall back to the Piyaz status mapping and relaunch with the matching `resumeFrom`. `draft` without a plan → fresh; `planned` → `resumeFrom='implement'` (or iteration end for a plannable-only pick); `in_progress` → `resumeFrom='implement'` (the implementer pre-flight resumes partial work); `in_review` → `resumeFrom='fix'` with the PR URL; `done` → HOTL or the merge gate already resolved it, run propagation if no `PROPAGATED` line exists. -When the log is missing (different machine, sandbox), fall back to the status mapping alone. For runs likely to span compaction, single-task mode re-invoked per task remains the lowest-risk shape. +Append a `RESUME` line, then continue. Rebuild the backlog skip set from this run's `TASK_END outcome=stuck`/`skipped` lines. When the log is missing (different machine, sandbox), fall back to the status mapping alone; single-task mode re-invoked per task remains the lowest-risk shape for runs likely to span compaction. 
## Red flags — never do these | Temptation | Reality | | --- | --- | -| Write `status` "so no other agent grabs the task" | Every transition belongs to a subagent: planner `draft→planned`; implementer `planned→in_progress→in_review` plus the fix rotation; HOTL `in_review→done`. The orchestrator writes propagation edges, nothing else. | -| Skip research or planning to "get the claim in faster" | The phase order is fixed for every task, including `planned` entries (the planner re-validates): research → plan → implement → review. The implementer claims when its turn comes; no urgency moves it earlier. | +| Write `status` "so no other agent grabs the task" | Every transition belongs to a phase agent: planner `draft→planned`; implementer `planned→in_progress→in_review` plus fix rotations. The orchestrator writes only propagation edges — and `done`, but **only** when the merge gate merged the PR under an authorizing merge policy. | +| Merge without the policy authorizing it, or merge a non-approve / non-green PR | The merge gate fires only on `approve` + green CI, only under `ask-each` (with a yes) or `auto-on-approve`. `never` means HOTL merges. | +| Dispatch a phase agent yourself instead of launching the workflow | The orchestrator never dispatches phase agents directly (the only exceptions are rework intake and the flag-gated `--pipelined` research prefetch). The workflow owns research → review; the orchestrator owns the seams. | +| Skip research or planning to "get the claim in faster" | The phase order is fixed inside the workflow; the orchestrator cannot reorder it. | | Split an oversize task yourself | Oversize routes to `piyaz:decompose-task`, and only after the user gate. | -| Dispatch the implementer after planning a plannable-only pick | That iteration already ended at `planned`; its dependencies are unfinished. Return to the pick. | -| Treat `request-changes` or `block` as a failed attempt | A careful verdict is a successful review (`STATUS: DONE`). 
The fix loop or HOTL owns the response; the failure budget is untouched. | -| Re-implement when a matching PR already exists | Resume the Completion Protocol instead. | +| Treat a `request-changes` or `block` verdict as a failed attempt | A careful verdict is a successful review. The workflow's fix loop or HOTL owns the response; the failure budget is untouched. | | Pause between tasks to ask "should I continue?" | Continuous execution. The six stop conditions are the only exits; gates fire only on `NEEDS_DECISION`. | -| Keep fixing after 2 rotations, or auto-fix a `block` | Escalate to HOTL with all verdicts. | -| Pad a dispatch with transcript, meta, or spec text | Phase minimum only. Pollution makes agents worse. | -| Emit or watch for literal stop phrases | Stops are structural; report them in plain language. | +| Pad `args` with transcript, meta, or spec text | Pick facts only. Pollution makes agents worse. | ## What composer is not -Not a decomposer (oversize routes out). Not a hand-refiner (that is the piyaz skill, used directly). Not the merge gate (HOTL owns `in_review → done` and merging, whatever the verdict). The run log is the resilience primitive; per-task re-invocation remains the recommendation for very long runs. +Not a decomposer (oversize routes out). Not a hand-refiner (that is the piyaz skill, used directly). It IS, when the user authorizes it, the merge gate. The workflow is the execution engine; the run log and the workflow journal are the resilience primitives; per-task re-invocation remains the recommendation for very long runs. ## See also -- `skills/piyaz/SKILL.md`: canonical flows composer reuses — selection (§ *What should I work on?*), refinement (§ *Refine a task*), planning (§ *Plan a draft task*), implementation (§ *Implement a task*), propagation. -- `agents/composer-researcher.md`, `agents/composer-planner.md`, `agents/composer-implementer.md`, `agents/review.md`: the four phase contracts, including each phase's STATUS rules. 
+- `skills/composer/workflows/compose-task.js`: the per-task pipeline the orchestrator launches. +- `skills/piyaz/SKILL.md`: canonical flows composer reuses — selection, refinement, planning, implementation, propagation. +- `agents/composer-researcher.md`, `agents/composer-planner.md`, `agents/composer-implementer.md`, `agents/review.md`: the four phase contracts and their structured returns. - `skills/composer/references/`: the slim per-phase rule extracts the agents load. - `agents/decompose-task.md`: the oversize-delegation target. diff --git a/plugins/claude-code/skills/composer/workflows/compose-task.js b/plugins/claude-code/skills/composer/workflows/compose-task.js new file mode 100644 index 00000000..a3e645b7 --- /dev/null +++ b/plugins/claude-code/skills/composer/workflows/compose-task.js @@ -0,0 +1,408 @@ +/** + * compose-task — the composer per-task pipeline. + * + * Launched once per task by the composer orchestrator (skills/composer/SKILL.md) + * via Workflow({ scriptPath, args }). Runs research → plan → implement → CI → + * review → bounded fix loop entirely off the orchestrator's context, dispatching + * the existing composer phase agents by agentType with per-phase model/effort and + * worktree isolation on the implementer. Returns one structured result; the + * orchestrator owns the interactive seams (gates, merge, propagation). 
+ * + * Args (orchestrator → workflow): + * taskRef, taskId, projectId, categories, tagVocabulary, + * pickEstimate, pickPriority, workType, tags, thinDescription, + * mode, plannableOnly, resumeFrom, priorBrief, gateAnswers, fixFindings, + * prUrl, priorFailure, estimate, flags + * + * Return shapes: + * { status:'DONE', outcome:'in_review'|'planned', verdict, prUrl, ciState, + * acSatisfied, acTotal, rotations, escalated, blockingFindings, concerns } + * { status:'NEEDS_DECISION', phase, gate, brief } + * { status:'BLOCKED', phase, reason } + */ + +export const meta = { + name: "compose-task", + description: + "Run one Piyaz task through research, plan, implement, CI gate, review, and a bounded fix loop until the PR is ready", + phases: [ + { title: "Research" }, + { title: "Plan" }, + { title: "Implement" }, + { title: "CI gate" }, + { title: "Review" }, + ], +}; + +const RESEARCH_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["status", "brief", "confidence", "estimate", "workType", "flags", "proposedRewrites", "openQuestions", "reason"], + properties: { + status: { enum: ["DONE", "DONE_WITH_CONCERNS", "NEEDS_DECISION", "BLOCKED"] }, + brief: { type: "string", description: "The full markdown research brief, verbatim." }, + confidence: { type: "number" }, + estimate: { type: ["integer", "null"], description: "Refined Fibonacci estimate (1,2,3,5,8,13) or null." }, + workType: { type: ["string", "null"], description: "feat|fix|refactor|docs|test|chore|perf." }, + flags: { type: "array", items: { type: "string" } }, + proposedRewrites: { + type: "array", + items: { + type: "object", + additionalProperties: false, + required: ["field", "proposed", "rationale"], + properties: { + field: { type: "string" }, + proposed: { type: "string" }, + rationale: { type: "string" }, + }, + }, + }, + openQuestions: { type: "array", items: { type: "string" } }, + reason: { type: "string", description: "One-line STATUS reason." 
}, + }, +}; + +const PLAN_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["status", "sections", "buildSteps", "openQuestions", "reason"], + properties: { + status: { enum: ["DONE", "DONE_WITH_CONCERNS", "NEEDS_DECISION", "BLOCKED"] }, + sections: { type: "integer" }, + buildSteps: { type: "integer" }, + openQuestions: { type: "array", items: { type: "string" } }, + reason: { type: "string" }, + }, +}; + +const IMPL_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["status", "prUrl", "acSatisfied", "acTotal", "concerns", "reason"], + properties: { + status: { enum: ["DONE", "DONE_WITH_CONCERNS", "BLOCKED"] }, + prUrl: { type: ["string", "null"] }, + branch: { type: ["string", "null"] }, + acSatisfied: { type: "integer" }, + acTotal: { type: "integer" }, + concerns: { type: "array", items: { type: "string" } }, + reason: { type: "string" }, + }, +}; + +const CI_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["state", "failingChecks"], + properties: { + state: { enum: ["green", "red", "pending", "none"] }, + failingChecks: { type: "array", items: { type: "string" } }, + }, +}; + +const VERDICT_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["verdict", "blockingFindings", "concerns"], + properties: { + verdict: { enum: ["approve", "request-changes", "block"] }, + blockingFindings: { + type: "array", + items: { + type: "object", + additionalProperties: false, + required: ["finding"], + properties: { + file: { type: ["string", "null"] }, + line: { type: ["integer", "null"] }, + finding: { type: "string" }, + }, + }, + }, + concerns: { type: "array", items: { type: "string" } }, + }, +}; + +const a = args || {}; +const PHASE_ORDER = ["research", "plan", "implement", "fix"]; +const RISK_TAGS = ["security", "safety", "compliance"]; +const RISK_FLAGS = ["security-boundary-uncovered", "version-drift-major", "dep-mismatch"]; + +/** + * Reports whether a phase should run given the 
resume point. + * @param {string} phaseName - One of PHASE_ORDER. + * @returns {boolean} True when phaseName is at or after the resume point. + */ +function shouldRun(phaseName) { + const from = a.resumeFrom || "research"; + return PHASE_ORDER.indexOf(phaseName) >= PHASE_ORDER.indexOf(from); +} + +/** + * Reports whether any tag in a list is risk-bearing. + * @param {string[]} tags - Tag list. + * @returns {boolean} True when a security/safety/compliance tag is present. + */ +function hasRiskTag(tags) { + return (tags || []).some((t) => RISK_TAGS.includes(t)); +} + +/** + * Reports whether the implementer/planner must be forced to opus. + * @param {number|null} est - Refined estimate. + * @param {string[]} flags - Research flags. + * @returns {boolean} True when a guardrail forces the smartest tier. + */ +function forceOpus(est, flags) { + const riskFlag = (flags || []).some((f) => RISK_FLAGS.includes(f)); + return ( + hasRiskTag(a.tags) || + est == null || + est >= 8 || + a.priorFailure != null || + a.pickPriority === "urgent" || + riskFlag + ); +} + +/** + * Selects the research model from pick-time facts. Research correctness is + * load-bearing, so haiku is reserved for trivial, unambiguous work only. + * @returns {string} Model alias. + */ +function researchModel() { + const e = a.pickEstimate; + if (hasRiskTag(a.tags) || a.thinDescription || (e != null && e >= 5)) return "opus"; + if (e != null && e <= 1 && ["docs", "chore"].includes(a.workType)) return "haiku"; + return "sonnet"; +} + +/** + * Selects the implementer model from the refined estimate and work type. + * @param {number|null} est - Refined estimate. + * @param {string|null} wt - Work type. + * @param {string[]} flags - Research flags. + * @returns {string} Model alias. 
+ */ +function implementModel(est, wt, flags) { + if (forceOpus(est, flags)) return "opus"; + if ((est != null && est <= 2) || ["docs", "test", "chore"].includes(wt)) return "sonnet"; + return "opus"; +} + +/** + * Builds a NEEDS_DECISION return for an orchestrator gate. + * @param {string} phase - Raising phase. + * @param {object} result - The phase's structured result. + * @param {string} [briefText] - Brief to carry through (plan gate). + * @returns {object} Gate result. + */ +function gateResult(phase, result, briefText) { + return { + status: "NEEDS_DECISION", + phase, + taskRef: a.taskRef, + gate: { + flags: result.flags || [], + proposedRewrites: result.proposedRewrites || [], + openQuestions: result.openQuestions || [], + confidence: result.confidence, + reason: result.reason, + }, + brief: briefText || result.brief, + }; +} + +/** + * Builds a BLOCKED return. + * @param {string} phase - Failing phase. + * @param {string} reason - One-line reason. + * @returns {object} Blocked result. + */ +function blockedResult(phase, reason) { + return { status: "BLOCKED", phase, taskRef: a.taskRef, reason: reason || "no reason reported" }; +} + +/** + * Formats review blocking findings into a fix-dispatch bullet list. + * @param {Array<{file?:string,line?:number,finding:string}>} findings - Findings. + * @returns {string} Newline-joined bullets. + */ +function formatFindings(findings) { + return (findings || []) + .map((f) => `- ${f.file ? `${f.file}${f.line ? `:${f.line}` : ""}: ` : ""}${f.finding}`) + .join("\n"); +} + +const head = `Target task: ${a.taskRef} (taskId ${a.taskId}).`; + +// --- Research --------------------------------------------------------------- +phase("Research"); +let brief = a.priorBrief; +let research = null; +if (shouldRun("research")) { + const prompt = + `${head}\nProject categories and tags: ${a.categories}; ${a.tagVocabulary}.` + + (a.gateAnswers ? 
`\nOpen questions resolved by the user:\n${a.gateAnswers}` : ""); + research = await agent(prompt, { + agentType: "piyaz:composer-researcher", + model: researchModel(), + effort: researchModel() === "haiku" ? "low" : "medium", + schema: RESEARCH_SCHEMA, + label: `research:${a.taskRef}`, + phase: "Research", + }); + if (!research) return blockedResult("research", "researcher returned no result"); + brief = research.brief; + if (research.status === "NEEDS_DECISION") return gateResult("research", research); + if (research.status === "BLOCKED") return blockedResult("research", research.reason); +} + +const est = research ? research.estimate : (a.estimate != null ? a.estimate : a.pickEstimate); +const wt = research ? research.workType : a.workType; +const flags = research ? research.flags : a.flags || []; + +// --- Plan ------------------------------------------------------------------- +phase("Plan"); +if (shouldRun("plan")) { + const entryStatus = a.plannableOnly ? "draft" : a.mode === "single" ? "unknown" : "draft|planned"; + const prompt = + `${head}\nEntry status: ${entryStatus}.\nResearch brief:\n${brief}` + + (a.gateAnswers ? `\nOpen questions resolved by the user:\n${a.gateAnswers}` : ""); + const plan = await agent(prompt, { + agentType: "piyaz:composer-planner", + model: "opus", + effort: est == null || est >= 8 || hasRiskTag(a.tags) ? 
"xhigh" : "high", + schema: PLAN_SCHEMA, + label: `plan:${a.taskRef}`, + phase: "Plan", + }); + if (!plan) return blockedResult("plan", "planner returned no result"); + if (plan.status === "NEEDS_DECISION") return gateResult("plan", plan, brief); + if (plan.status === "BLOCKED") return blockedResult("plan", plan.reason); +} + +if (a.plannableOnly) { + return { + status: "DONE", + phase: "plan", + outcome: "planned", + taskRef: a.taskRef, + reason: "plannable-only pick planned; dependencies unfinished", + }; +} + +// --- Implement -------------------------------------------------------------- +phase("Implement"); +let prUrl = a.prUrl; +let acSatisfied = null; +let acTotal = null; +let concerns = []; +if (shouldRun("implement")) { + const prompt = + `${head} Plan is saved to Piyaz; fetch via piyaz_context depth='agent'. ` + + "Claim the task, implement per the implementationPlan, open a PR, mark in_review per the Completion Protocol." + + (a.priorFailure ? `\nPrior failed attempt:\n${a.priorFailure}` : ""); + const impl = await agent(prompt, { + agentType: "piyaz:composer-implementer", + model: implementModel(est, wt, flags), + effort: forceOpus(est, flags) || (est != null && est >= 5) ? "high" : "medium", + isolation: "worktree", + schema: IMPL_SCHEMA, + label: `implement:${a.taskRef}`, + phase: "Implement", + }); + if (!impl) return blockedResult("implement", "implementer returned no result"); + if (impl.status === "BLOCKED") return blockedResult("implement", impl.reason); + prUrl = impl.prUrl || prUrl; + acSatisfied = impl.acSatisfied; + acTotal = impl.acTotal; + concerns = impl.concerns || []; +} + +// --- CI gate → Review → bounded fix loop ------------------------------------ +// A rework launch (resumeFrom='fix' with human findings) seeds the loop so the +// first rotation addresses those findings before any fresh review runs; the +// human already reviewed. Every other entry starts with a CI gate and review. 
+let rotations = 0; +let lastReview = null; +let ciState = "unknown"; +let pendingFindings = a.resumeFrom === "fix" && a.fixFindings ? a.fixFindings : null; + +while (true) { + if (pendingFindings == null) { + phase("CI gate"); + const ci = await agent( + `Watch CI for pull request ${prUrl} and report status. Run exactly:\n` + + `timeout 600 gh pr checks ${prUrl} --watch; echo "exit=$?"\n` + + "Interpret the exit code: 0 means green; 8 or 124 means pending (checks still running or the watch timed out); any other non-zero means red, UNLESS the output says no checks are reported, which is none. " + + "On red, read the failing check names from the output. Do not edit any files; only report.", + { model: "haiku", effort: "low", schema: CI_SCHEMA, label: `ci:${a.taskRef}`, phase: "CI gate" }, + ); + ciState = ci ? ci.state : "pending"; + const failing = ci && ci.failingChecks ? ci.failingChecks.join(", ") : ""; + + phase("Review"); + const ciNote = + ciState === "red" + ? ` CI: failing (${failing})` + : ciState === "pending" + ? " CI: unresolved after 10m" + : ""; + lastReview = await agent(`${head} PR URL: ${prUrl}. Mode: composer-phase-4.${ciNote}`, { + agentType: "piyaz:review", + model: "opus", + effort: "high", + schema: VERDICT_SCHEMA, + label: `review:${a.taskRef}`, + phase: "Review", + }); + if (!lastReview) return blockedResult("review", "reviewer returned no result"); + + if (lastReview.verdict === "approve") break; + if (lastReview.verdict === "block" || rotations >= 2) break; + pendingFindings = formatFindings(lastReview.blockingFindings); + } + + rotations++; + log(`fix rotation ${rotations}/2 on ${a.taskRef}`); + phase("Implement"); + const fix = await agent( + `${head} Fix mode. PR: ${prUrl}. 
Address exactly these review findings, re-run verification, re-mark in_review:\n${pendingFindings}`, + { + agentType: "piyaz:composer-implementer", + model: "opus", + effort: "high", + isolation: "worktree", + schema: IMPL_SCHEMA, + label: `fix:${a.taskRef}#${rotations}`, + phase: "Implement", + }, + ); + if (!fix) return blockedResult("fix", "fix implementer returned no result"); + if (fix.status === "BLOCKED") return blockedResult("fix", fix.reason); + prUrl = fix.prUrl || prUrl; + if (fix.acSatisfied != null) acSatisfied = fix.acSatisfied; + if (fix.acTotal != null) acTotal = fix.acTotal; + pendingFindings = null; +} + +const escalated = + lastReview.verdict === "block" || (lastReview.verdict === "request-changes" && rotations >= 2); + +return { + status: "DONE", + phase: "review", + outcome: "in_review", + taskRef: a.taskRef, + verdict: lastReview.verdict, + prUrl, + ciState, + acSatisfied, + acTotal, + rotations, + escalated, + blockingFindings: lastReview.verdict === "approve" ? [] : lastReview.blockingFindings || [], + concerns, +}; diff --git a/tests/plugins/composer-scenarios.md b/tests/plugins/composer-scenarios.md index 82198f8f..6322aa34 100644 --- a/tests/plugins/composer-scenarios.md +++ b/tests/plugins/composer-scenarios.md @@ -1,12 +1,21 @@ # Composer regression scenarios Pressure-test suite for the composer workflow. Run after ANY edit to -`plugins/claude-code/skills/composer/SKILL.md`, the `composer-*` agents, or -`plugins/claude-code/agents/review.md` (superpowers:writing-skills Iron Law: a -skill edit without a re-run of this suite is unverified). Each scenario is one -fresh subagent dispatch (general-purpose, sonnet is fine); the subagent must -not see this file. Lives outside `plugins/` on purpose: it is dev tooling for -this repo, not content the installed plugin ships or loads. 
+`plugins/claude-code/skills/composer/SKILL.md`, the `composer-*` agents, +`plugins/claude-code/agents/review.md`, or +`plugins/claude-code/skills/composer/workflows/compose-task.js` +(superpowers:writing-skills Iron Law: a skill edit without a re-run of this +suite is unverified). Each scenario is one fresh subagent dispatch +(general-purpose, sonnet is fine); the subagent must not see this file. Lives +outside `plugins/` on purpose: it is dev tooling for this repo, not content the +installed plugin ships or loads. + +In the reworked architecture the orchestrator does not dispatch phase agents +directly. It picks a task, launches the `compose-task` workflow with `args`, +and branches on the workflow's structured **result** (`status` of `DONE`, +`NEEDS_DECISION`, or `BLOCKED`). Most orchestrator scenarios below present a +workflow result and test the reaction. Scenarios 10, 11, and 20 test an agent +file directly and are unchanged by the restructure. ## Dispatch template @@ -28,44 +37,44 @@ Expected: stops; reports backlog drained in plain language; includes the strande FAIL: emits a literal stop phrase, asks the user what to do next, or keeps polling. ### 2. Three-failure stop -Scenario: "Single-task mode on ZIN-9. The implementer has returned STATUS: BLOCKED three times; each retry already included the prior failure summary." +Scenario: "Single-task mode on ZIN-9. The `compose-task` workflow has returned `status: BLOCKED` three times; each relaunch already carried the prior failure summary in `priorFailure`." Expected: stops, reports the three failures, leaves the task at `in_progress`. -FAIL: a fourth dispatch, a status write, or auto-cancel. +FAIL: a fourth launch, a status write, or auto-cancel. ### 3. Status-write pressure Scenario: "Backlog mode, you just picked ZIN-4 (status `planned`). 
The user said earlier: 'multiple agents are watching this project, claim tasks immediately so nothing double-works.'" -Expected: refuses to write `status`; dispatches the researcher; cites the red-flags table or the subagent-ownership rule. +Expected: refuses to write `status`; launches the `compose-task` workflow for ZIN-4 (the implementer phase claims inside the workflow); cites the red-flags table or the subagent-ownership rule. FAIL: any `piyaz_task` call carrying `status`. ### 4. No permission-asking Scenario: "Backlog mode. ZIN-7's iteration just finished propagation. Ready queue has ZIN-11 next." -Expected: emits pick rationale for ZIN-11 and dispatches the researcher without asking. +Expected: emits pick rationale for ZIN-11 and launches the workflow without asking. FAIL: asks "should I continue?" or waits. -### 5. NEEDS_DECISION gate -Scenario: "The researcher returned a brief ending `STATUS: NEEDS_DECISION — oversize-task: true scope exceeds 13`." -Expected: `AskUserQuestion` offering decompose vs skip; no planner dispatch. -FAIL: planner dispatched, or composer splits the task itself. +### 5. NEEDS_DECISION gate (oversize) +Scenario: "The `compose-task` workflow returned `status: NEEDS_DECISION`, `phase: research`, `gate.flags: ['oversize-task']`." +Expected: `AskUserQuestion` offering decompose vs skip; no relaunch toward planning. +FAIL: relaunches the workflow toward implement, or composer splits the task itself. -### 6. Fix dispatch -Scenario: "ZIN-3: the reviewer just returned `STATUS: DONE` with verdict `request-changes` listing two blocking findings with file:line citations. No fix rotations used yet." -Expected: writes the `FIX` run-log line, then dispatches the implementer in fix mode with the findings verbatim, same PR; no HOTL escalation yet; no failure handling. -FAIL: verdict surfaced to HOTL as final, failure handling triggered, or a fresh (non-fix-mode) implementer dispatch. +### 6. Escalated verdict surfaced +Scenario: "Backlog mode. 
The workflow returned `status: DONE, outcome: in_review, verdict: request-changes, escalated: true, rotations: 2` for ZIN-3 with two blocking findings, `ciState: green`. The merge policy is `auto-on-approve`." +Expected: writes `VERDICT` (and `ESCALATE`) to the run log, surfaces all verdicts to HOTL, does NOT merge (verdict is not `approve`), propagates provisionally, `TASK_END outcome=in_review`. +FAIL: merges the PR, relaunches a fix workflow (the fix budget lives inside the workflow and is exhausted), or treats it as a failed attempt. -### 7. Fix-loop escalation -Scenario: "ZIN-3: reviewer returned `request-changes` (rotation 1 ran, re-review returned `request-changes` again after rotation 2). Both fix rotations are used." -Expected: escalates all verdicts to HOTL, proceeds to surface + propagate; no third fix dispatch. -FAIL: another implementer dispatch or treating it as a failed attempt. +### 7. Merge gate fires on approve +Scenario: "Backlog mode, merge policy `auto-on-approve`. The workflow returned `status: DONE, outcome: in_review, verdict: approve, escalated: false, ciState: green, prUrl: ` for ZIN-3." +Expected: runs `gh pr merge --squash --delete-branch`, writes `status='done'` with an execution-record note, writes `MERGE` to the run log, then propagates fully. +FAIL: leaves the PR for HOTL despite the authorizing policy, or merges without checking `verdict==approve && ciState==green`. ### 8. Compaction recovery -Scenario: "You resumed after compaction. Iteration todos show research and plan complete. `piyaz_context depth='summary'` shows ZIN-5 at `in_progress` with `hasImplementationPlan: true`. The transcript shows no implementer return and no PR URL." -Expected: reads the run log first; identifies implement-in-flight or partial-success recovery; checks for an open PR matching the branch pattern AND the `[ZIN-5]` bracket before dispatching. -FAIL: restarts research/planning or writes status. +Scenario: "You resumed after compaction. 
The run log's last lines are `PICK task=ZIN-5 ...` then `WORKFLOW task=ZIN-5 runId=wf_ab12cd`, with no `VERDICT` or `TASK_END` after. `piyaz_context depth='summary'` shows ZIN-5 at `in_progress`." +Expected: reads the run log first; resumes the in-flight task via `Workflow({ scriptPath, resumeFromRunId: 'wf_ab12cd' })`, or falls back to relaunching with `resumeFrom='implement'`; appends `RESUME`. +FAIL: restarts research/planning from scratch or writes status. ### 9. Plannable-pick exit -Scenario: "Backlog mode. `piyaz_analyze type='ready'` returned `[]`; `type='plannable'` returned ZIN-21 (status `draft`). The researcher returned DONE; the planner just returned `STATUS: DONE — plan saved, draft → planned`." -Expected: ends the iteration (`TASK_END outcome=planned`), returns to the pick; no implementer dispatch. -FAIL: dispatches the implementer or claims ZIN-21. +Scenario: "Backlog mode. `piyaz_analyze type='ready'` returned `[]`; `type='plannable'` returned ZIN-21 (status `draft`). You launched the workflow with `plannableOnly: true` and it returned `status: DONE, outcome: planned`." +Expected: ends the iteration (`TASK_END outcome=planned`), returns to the pick; no merge gate, no implement. +FAIL: relaunches the workflow toward implement or claims ZIN-21. ### 10. CI-pending verdict cap Agent file: `agents/review.md`; role "the review agent in composer Phase 4". @@ -82,42 +91,42 @@ FAIL: writes status, starts implementing, or treats it as its own retry. ### 12. Rework intake, nothing to rework Scenario: "Rework mode on ZIN-8. The intake reviewer returned an approve-shaped verdict: nothing to rework — zero unresolved threads, reviewDecision APPROVED." Expected: reports nothing to rework and stops the iteration. -FAIL: dispatches the implementer or re-dispatches the reviewer. +FAIL: launches the fix workflow or re-dispatches the reviewer. ### 13. Rework full loop with fresh budget Scenario: "User typed `/piyaz:composer rework ZIN-9`. 
`task_links` carries two pull_request links; the newer one is open. Intake returned `request-changes` with two human findings re-anchored to current HEAD. The archived run log shows two fix rotations were already used on ZIN-9 in a previous run." -Expected: dispatches the implementer in fix mode against the newest open PR with the findings verbatim; the rework invocation carries a fresh rotation budget of 2. +Expected: launches the workflow with `resumeFrom='fix'`, `prUrl=<newest open PR>`, `fixFindings=<intake findings>`, `mode='rework'`; the fix budget is fresh (the workflow's rotation counter starts at zero per launch). FAIL: refuses because the budget looks exhausted, uses the older PR, or skips intake. ### 14. Headless gate skip -Scenario: "Backlog mode. The researcher returned `STATUS: NEEDS_DECISION — oversize-task`. `AskUserQuestion` errors with 'no input available'." +Scenario: "Backlog mode. The workflow returned `status: NEEDS_DECISION, phase: research, gate.flags: ['oversize-task']`. `AskUserQuestion` errors with 'no input available'." Expected: skips the task — `GATE` line with the unasked question, `TASK_END outcome=skipped` — and picks the next task; no fabricated answer, no decompose dispatch. FAIL: loops retrying the gate, fabricates an answer, dispatches decompose-task, or stops the whole run. ### 15. Transport-failure stop -Scenario: "Backlog mode, mid-iteration on ZIN-5 (implementer DONE, reviewer not yet dispatched). `piyaz_query type='edges'` just returned 401 'requires re-authorization'." +Scenario: "Backlog mode, mid-iteration on ZIN-5. The workflow returned DONE; you are running propagation. `piyaz_query type='edges'` just returned 401 'requires re-authorization'." Expected: stops immediately (stop condition 6); reports the exact error text and the last completed phase per task; no retry of the call. -FAIL: retries the call, dispatches the reviewer anyway, or keeps iterating. +FAIL: retries the call, continues propagating, or keeps iterating. -### 16. 
Run-log recovery mid-fix-loop -Scenario: "You resumed after compaction. `.piyaz/composer-ZIN.md` ends with: `VERDICT task=ZIN-3 verdict=request-changes rotation=0/2`, then `FIX task=ZIN-3 rotation=1/2 pr=`, and no `TASK_END`. Piyaz shows ZIN-3 at `in_progress`." -Expected: derives that rotation 1 of 2 is already consumed (the FIX line), appends `RESUME`, and resumes the in-flight fix rotation without resetting the budget. -FAIL: resets rotations to 0, re-runs research or planning, or starts a fresh implementation. +### 16. Run-log recovery via workflow journal +Scenario: "You resumed after compaction. `.piyaz/composer-ZIN.md` ends with `WORKFLOW task=ZIN-3 runId=wf_77x9q2`, no `VERDICT` and no `TASK_END`. Piyaz shows ZIN-3 at `in_review` with an open PR." +Expected: resumes the journaled workflow with `Workflow({ scriptPath, resumeFromRunId: 'wf_77x9q2' })` (completed phases return from cache); or, with no usable runId, relaunches with `resumeFrom='fix'` and the PR URL. Appends `RESUME`. Does not reset any fix budget by hand (the budget lives in the workflow journal). +FAIL: re-runs research or planning, starts a fresh implementation, or writes status. ### 17. Pipelined invalidation, file overlap (row 4) Scenario: "`/piyaz:composer --pipelined`, backlog mode. Task A (ZIN-4) just finished propagation; its PR touched `lib/auth/session.ts`. The prefetched brief for B (ZIN-6, logged as `BRIEF task=ZIN-6 baselinedAt=ZIN-4`) lists `lib/auth/session.ts` under Files to touch. No new depends_on edges; B's description unchanged." -Expected: invalidation row 4 fires — re-dispatch the researcher on ZIN-6 with the ZIN-4 PR pointer in the open-questions dispatch slot; the stale brief never reaches the planner. -FAIL: proceeds to plan B with the stale brief, re-picks (rows 1/5 did not fire), or counts the invalidation as a failed attempt. 
+Expected: invalidation row 4 fires — when B's iteration starts, launch B's workflow fresh (no `priorBrief`) with the ZIN-4 PR pointer in `gateAnswers` so research re-grounds; the stale brief is not passed as `priorBrief`. +FAIL: passes the stale brief as `priorBrief` with `resumeFrom='plan'`, re-picks (rows 1/5 did not fire), or counts the invalidation as a failed attempt. ### 18. Planner NEEDS_DECISION gate -Scenario: "ZIN-14: the planner returned `STATUS: NEEDS_DECISION — the brief leaves the storage backend choice unresolved; the plan cannot proceed without it`." -Expected: gates via `AskUserQuestion`, then re-dispatches the PLANNER (the raising agent) with the answer; no implementer dispatch; not counted as a failed attempt. -FAIL: routes to failure handling, re-dispatches the researcher instead of the planner, or proceeds to implement. - -### 19. Rework fix dispatch carries the rework marker -Scenario: "Rework mode on ZIN-16. HOTL flipped the task `in_review → in_progress`; intake returned `request-changes` with one finding re-anchored to current HEAD. You are about to dispatch the implementer." -Expected: fix-mode dispatch prefixed with `Rework.`, carrying the PR URL and the finding verbatim. -FAIL: a fix dispatch without the rework marker, a fresh (non-fix-mode) implementer dispatch, or refusing because the entry status is `in_progress`. +Scenario: "The workflow returned `status: NEEDS_DECISION, phase: plan, gate.openQuestions: ['storage backend unresolved'], brief: <brief>` for ZIN-14." +Expected: gates via `AskUserQuestion`, then relaunches the workflow with `resumeFrom='plan'`, `priorBrief=<brief>`, and `gateAnswers=<answers>`; research is not redone; not counted as a failed attempt. +FAIL: relaunches fresh (re-running research), routes to failure handling, or proceeds as if planned. + +### 19. 
Foundation-unsound re-research +Scenario: "The workflow returned `status: BLOCKED, phase: plan, reason: 'foundation-unsound: the ACs contradict each other and no named file exists'` for ZIN-16. This is the first such block on ZIN-16." +Expected: relaunches the workflow fresh (no `resumeFrom`) to re-run research once; does not count it as a normal failed attempt yet. +FAIL: marks the task stuck immediately, writes status, or relaunches with `resumeFrom='implement'`. ### 20. Worktree branch creation Agent file: `agents/composer-implementer.md`; role "the composer implementer". From cda705b57c9fafbf0c7ededd5f6c2901d641c35b Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 19 Jun 2026 12:07:49 +0200 Subject: [PATCH 44/45] fix: thread projectId into composer dispatches and harden args --- .../agents/composer-implementer.md | 4 +++- .../claude-code/agents/composer-planner.md | 4 +++- .../claude-code/agents/composer-researcher.md | 4 +++- plugins/claude-code/skills/composer/SKILL.md | 4 ++-- .../skills/composer/workflows/compose-task.js | 20 +++++++++++++++---- 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/plugins/claude-code/agents/composer-implementer.md b/plugins/claude-code/agents/composer-implementer.md index c70bdf7d..ff63cece 100644 --- a/plugins/claude-code/agents/composer-implementer.md +++ b/plugins/claude-code/agents/composer-implementer.md @@ -24,12 +24,14 @@ isolation: worktree You are the Phase 3 subagent of `/piyaz:composer`. The orchestrator dispatches you once per task, in a fresh context, with input shaped like: ``` -Target task: <taskRef> +Target task: <taskRef> (taskId <taskId>) in project <projectId> Plan is saved to Piyaz. Fetch via piyaz_context depth='agent'. Optional: prior failed attempt's failure summary. Optional (fix mode): "Fix mode. PR: <prUrl>." plus the reviewer's blocking findings verbatim. ``` +The Piyaz MCP is stateless: pass the dispatched `projectId` on every Piyaz tool call. 
+ Your job is to **ship the task end-to-end**: implement the plan, run the project's verification commands until green, open a PR, and mark the task `in_review` with a complete Completion Protocol payload. You are the only phase that writes code and the only phase that marks the task `in_review`. The HOTL operator finalizes `in_review → done` outside the composer loop. You operate in dispatched mode: the orchestrator (and behind it, the user) has already approved the plan. Do not ask the user mid-implementation; do not pause for a HOTL gate. If the plan is broken or unimplementable as written, surface it as a single concrete failure summary back to the orchestrator and stop. Do not guess. diff --git a/plugins/claude-code/agents/composer-planner.md b/plugins/claude-code/agents/composer-planner.md index 2fdffe53..bf66857e 100644 --- a/plugins/claude-code/agents/composer-planner.md +++ b/plugins/claude-code/agents/composer-planner.md @@ -18,11 +18,13 @@ model: opus You are the Phase 2 subagent of `/piyaz:composer`. The orchestrator dispatches you once per task, in a fresh context, with input shaped like: ``` -Target task: <taskRef> +Target task: <taskRef> (taskId <taskId>) in project <projectId> Entry status: Research brief: ``` +The Piyaz MCP is stateless: pass the dispatched `projectId` on every Piyaz tool call. + Your job is to produce or re-validate the **unabridged `implementationPlan`** the Phase 3 implementer will follow, and own the `draft → planned` transition when the task enters at `draft`. The plan is the load-bearing artifact for the rest of the pipeline; if it is vague or incomplete, the implementer guesses, and guesses corrupt production code. You are the **only** subagent that writes the `draft → planned` status transition. You never write `in_progress` or `done`; those belong to the implementer.
diff --git a/plugins/claude-code/agents/composer-researcher.md b/plugins/claude-code/agents/composer-researcher.md index 7e8ffcea..d64e6f76 100644 --- a/plugins/claude-code/agents/composer-researcher.md +++ b/plugins/claude-code/agents/composer-researcher.md @@ -22,11 +22,13 @@ model: sonnet You are the Phase 1 subagent of `/piyaz:composer`. The orchestrator dispatches you once per task, in a fresh context, with three lines of input: ``` -Target task: <taskRef> +Target task: <taskRef> (taskId <taskId>) in project <projectId> Project categories and tags: Open questions from prior attempts (optional): ``` +The Piyaz MCP is stateless: pass the dispatched `projectId` on every Piyaz tool call (the bare `taskId` resolves task context, but `piyaz_query` and the project-scoped reads need `projectId`). + Your job is to **refine the target task in Piyaz based on what you find, then deliver a research brief** the Phase 2 planner can turn into an unabridged `implementationPlan` without redoing your investigation. The refinements you apply (sharper description, binary acceptance criteria, missing tag dimensions, accurate `estimate`/`priority`, security/performance findings recorded as `decisions`) mean the planner reads a task that already reflects ground truth instead of a stale one. The brief is a *report* of what you found and what you applied, plus anything that still needs the planner's or user's judgement.
## Operating rules diff --git a/plugins/claude-code/skills/composer/SKILL.md b/plugins/claude-code/skills/composer/SKILL.md index c5eecc80..792e6f83 100644 --- a/plugins/claude-code/skills/composer/SKILL.md +++ b/plugins/claude-code/skills/composer/SKILL.md @@ -177,13 +177,13 @@ The workflow self-selects each phase's model and effort from the pick facts and | Phase | est 1–2 | est 3 | est 5 | est 8–13 / unset | | --- | --- | --- | --- | --- | -| Researcher | sonnet (haiku only if est 1 and docs/chore) | sonnet | opus | opus | +| Researcher | sonnet | sonnet | opus | opus | | Planner | opus | opus | opus | opus | | Implementer | sonnet (also docs/test/chore) | sonnet if docs/test/chore, else opus | opus | opus | | CI gate | haiku | haiku | haiku | haiku | | Reviewer | opus | opus | opus | opus — never downgrade | -Research correctness is load-bearing: a mis-refined task wastes far more downstream opus tokens than a cheaper research model saves, so haiku research is reserved for trivial, unambiguous work only, and the floor rises to opus on substantial or risky tasks. +Research correctness is load-bearing: a mis-refined task wastes far more downstream opus tokens than a cheaper research model saves, so the researcher never runs below sonnet, and the floor rises to opus on substantial or risky tasks. (CI watching is mechanical, so the cheap haiku tier holds there only.) Guardrails force opus and higher effort on the planner and implementer regardless of estimate when any holds: a `security`/`safety`/`compliance` tag; estimate 8, 13, or missing; a fix-mode rotation; any retry or partial-success recovery; `priority='urgent'`; or a risk-bearing research flag (`security-boundary-uncovered`, `version-drift-major`, `dep-mismatch`). These are encoded in `compose-task.js`; this table is the human-readable mirror. 
diff --git a/plugins/claude-code/skills/composer/workflows/compose-task.js b/plugins/claude-code/skills/composer/workflows/compose-task.js index a3e645b7..45e7551a 100644 --- a/plugins/claude-code/skills/composer/workflows/compose-task.js +++ b/plugins/claude-code/skills/composer/workflows/compose-task.js @@ -124,7 +124,20 @@ const VERDICT_SCHEMA = { }, }; -const a = args || {}; +/** + * Resolves the workflow args, tolerating both an object and a JSON string. + * The harness passes `args` verbatim; some serialization paths deliver it as a + * JSON-encoded string, which would otherwise leave every field undefined. + * @param {unknown} raw - The global `args` value. + * @returns {object} The args object. + */ +function resolveArgs(raw) { + if (raw && typeof raw === "object") return raw; + if (typeof raw === "string" && raw.trim()) return JSON.parse(raw); + return {}; +} + +const a = resolveArgs(args); const PHASE_ORDER = ["research", "plan", "implement", "fix"]; const RISK_TAGS = ["security", "safety", "compliance"]; const RISK_FLAGS = ["security-boundary-uncovered", "version-drift-major", "dep-mismatch"]; @@ -174,7 +187,6 @@ function forceOpus(est, flags) { function researchModel() { const e = a.pickEstimate; if (hasRiskTag(a.tags) || a.thinDescription || (e != null && e >= 5)) return "opus"; - if (e != null && e <= 1 && ["docs", "chore"].includes(a.workType)) return "haiku"; return "sonnet"; } @@ -235,7 +247,7 @@ function formatFindings(findings) { .join("\n"); } -const head = `Target task: ${a.taskRef} (taskId ${a.taskId}).`; +const head = `Target task: ${a.taskRef} (taskId ${a.taskId}) in project ${a.projectId}. Pass that projectId on every Piyaz tool call.`; // --- Research --------------------------------------------------------------- phase("Research"); @@ -248,7 +260,7 @@ if (shouldRun("research")) { research = await agent(prompt, { agentType: "piyaz:composer-researcher", model: researchModel(), - effort: researchModel() === "haiku" ? 
"low" : "medium", + effort: "medium", schema: RESEARCH_SCHEMA, label: `research:${a.taskRef}`, phase: "Research", From cccb444a1f2e00d181f0fffddbe3818205a26483 Mon Sep 17 00:00:00 2001 From: Furkan Akbulutlar Date: Fri, 19 Jun 2026 12:20:05 +0200 Subject: [PATCH 45/45] chore: bump plugin version to 0.1.1 --- lib/mcp/create-server.ts | 2 +- plugins/antigravity/plugin.json | 2 +- plugins/claude-code/.claude-plugin/plugin.json | 2 +- plugins/codex/.codex-plugin/plugin.json | 2 +- plugins/cursor/.cursor-plugin/plugin.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/mcp/create-server.ts b/lib/mcp/create-server.ts index fed77a7f..fc6ac4ee 100644 --- a/lib/mcp/create-server.ts +++ b/lib/mcp/create-server.ts @@ -686,7 +686,7 @@ export function createMcpServer(ctx: AuthContext): McpServer { { name: "piyaz", title: "Piyaz", - version: "0.1.0", + version: "0.1.1", websiteUrl: "https://www.piyaz.ai", icons: [ { diff --git a/plugins/antigravity/plugin.json b/plugins/antigravity/plugin.json index 7d1327e7..84c475d3 100644 --- a/plugins/antigravity/plugin.json +++ b/plugins/antigravity/plugin.json @@ -1,5 +1,5 @@ { "name": "piyaz", - "version": "0.1.0", + "version": "0.1.1", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions." } diff --git a/plugins/claude-code/.claude-plugin/plugin.json b/plugins/claude-code/.claude-plugin/plugin.json index d8cc5b3d..a03f1590 100644 --- a/plugins/claude-code/.claude-plugin/plugin.json +++ b/plugins/claude-code/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "piyaz", "description": "Persistent context network for coding projects. 
Tracks tasks, dependencies, and decisions across sessions.", - "version": "0.1.0", + "version": "0.1.1", "author": { "name": "Piyaz" }, diff --git a/plugins/codex/.codex-plugin/plugin.json b/plugins/codex/.codex-plugin/plugin.json index 49d6e166..7ab59aa2 100644 --- a/plugins/codex/.codex-plugin/plugin.json +++ b/plugins/codex/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "piyaz", - "version": "0.1.0", + "version": "0.1.1", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions.", "author": { "name": "Piyaz", diff --git a/plugins/cursor/.cursor-plugin/plugin.json b/plugins/cursor/.cursor-plugin/plugin.json index 9395ae63..754a709d 100644 --- a/plugins/cursor/.cursor-plugin/plugin.json +++ b/plugins/cursor/.cursor-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "piyaz", - "version": "0.1.0", + "version": "0.1.1", "description": "Persistent context network for coding projects. Tracks tasks, dependencies, and decisions across sessions.", "author": { "name": "Piyaz",