From 7c2c9575e65673b1ad7e9eb150c868543cc95a2a Mon Sep 17 00:00:00 2001 From: HiranoMasaaki Date: Fri, 13 Mar 2026 22:17:19 +0000 Subject: [PATCH] feat: add review-definition expert as soft gate before testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Separate concerns: write-definition focuses on TOML structure, review-definition checks plan alignment (domain constraints, library names, instruction quality). Changes: - Add @create-expert/review-definition (soft review gate) - Build loop: write → review → test → verify (two gates: soft + hard) - Design Principle 2: "Binary Checks" → "Soft Review + Hard Verification" - write-definition: remove instruction quality checklists (now reviewer's job) - plan: preserve user-specified library names in Domain Knowledge Co-Authored-By: Claude Opus 4.6 (1M context) --- definitions/create-expert/perstack.toml | 133 +++++++++++++++--------- 1 file changed, 82 insertions(+), 51 deletions(-) diff --git a/definitions/create-expert/perstack.toml b/definitions/create-expert/perstack.toml index 371ddd09..d4ee48b4 100644 --- a/definitions/create-expert/perstack.toml +++ b/definitions/create-expert/perstack.toml @@ -3,8 +3,9 @@ # # create-expert — pipeline orchestration (plan → build) # ├── @create-expert/plan — requirements + architecture → plan.md -# └── @create-expert/build — write → test → verify cycle +# └── @create-expert/build — write → review → test → verify cycle # ├── @create-expert/write-definition — perstack.toml authoring +# ├── @create-expert/review-definition — plan alignment review (soft gate) # ├── @create-expert/test-expert — single query execution (pure executor, no evaluation) # └── @create-expert/verify-test — hard signal execution + reproducibility + structural checks # ============================================================================= @@ -29,13 +30,15 @@ # that exercise each constraint, so missing constraints surface as # hard signal failures, not LLM 
opinion. # -# 2. Instruction Quality via Binary Checks -# - Subjective self-checks ("would removing this make output worse?") -# do not work — the LLM always judges its own output as necessary. -# - Binary checks work: "code block present?" "library name present?" -# have unambiguous yes/no answers with clear remediation actions. -# - Pattern: structural checks (delegates array, pick list) have always -# been followed. Content checks must follow the same binary pattern. +# 2. Soft Review + Hard Verification +# - Instruction quality is enforced by a soft review gate: LLM reads +# plan.md and perstack.toml, checks whether each domain constraint +# is faithfully reflected, and flags violations. Soft signal adds +# value here because plan alignment is semantic, not syntactic. +# - Structural correctness (delegates array, pick list, exec capability) +# is enforced by hard signals in verify-test. +# - Build loop has two gates: review (soft) must pass before testing, +# verify (hard) must pass before completion. # # 3. Domain Agnosticism # - create-expert must produce experts for ANY domain — coding, writing, @@ -67,9 +70,8 @@ # completion criteria, priority rules. # - Everything else — implementation techniques, library choices, file # structures, well-known algorithms — is noise that dilutes the signal. -# - Enforced by a 6-item binary checklist (no code blocks, no library -# names, no file paths, no procedures, no technique explanations, -# ≤15 lines for non-coordinator experts). +# - Enforced by review-definition checking each instruction against +# plan.md's domain knowledge. # # 7. Brevity = Focus # - Verbose instructions dilute LLM attention. The more you write, the @@ -99,7 +101,7 @@ You are the coordinator for creating and modifying Perstack expert definitions. 
## Delegates - @create-expert/plan — requirements analysis + architecture design: use cases, verification signals, domain knowledge, delegation tree -- @create-expert/build — write → test → verify cycle (internally delegates to write-definition, test-expert, verify-test) +- @create-expert/build — write → review → test → verify cycle (internally delegates to write-definition, review-definition, test-expert, verify-test) ## Coordination @@ -153,7 +155,7 @@ Before writing the plan, read existing perstack.toml (if provided) and relevant One paragraph: what it does, for whom, what makes it different from a generic attempt. ### Domain Knowledge -Constraints and rules unique to this expert, extracted from the user's request. Every word choice is a signal — "polished" means no placeholders, "well-tested" means automated playthroughs, "run anywhere" means cross-platform. Only include what the LLM wouldn't know without being told. Do not include code snippets, file paths, library recommendations, or step-by-step procedures. +Constraints and rules unique to this expert, extracted from the user's request. Every word choice is a signal — "polished" means no placeholders, "well-tested" means automated playthroughs, "run anywhere" means cross-platform. Only include what the LLM wouldn't know without being told. Do not include code snippets, file paths, or step-by-step procedures. Preserve library or tool names the user explicitly specified (these are hard requirements); do not add library recommendations beyond what the user requested. ### Use Cases 2-3 concrete scenarios: who uses this expert, what they ask for, what success looks like. @@ -220,28 +222,31 @@ pick = [ defaultModelTier = "low" version = "1.0.16" description = """ -Orchestrates the write → test → verify cycle for perstack.toml. +Orchestrates the write → review → test → verify cycle for perstack.toml. Provide: path to plan.md (containing requirements, architecture, test query, and verification signals). 
Optionally: path to existing perstack.toml to preserve. """ instruction = """ -You are the build loop orchestrator. You coordinate write-definition, test-expert, and verify-test to produce a perstack.toml that passes verification. +You are the build loop orchestrator. You coordinate write-definition, review-definition, test-expert, and verify-test to produce a perstack.toml that passes both review and verification. -You do NOT write perstack.toml yourself. You do NOT evaluate test results yourself. You delegate to specialists and act on their verdicts. +You do NOT write perstack.toml yourself. You do NOT evaluate results yourself. You delegate to specialists and act on their verdicts. ## Delegates - @create-expert/write-definition — writes or modifies perstack.toml from plan.md +- @create-expert/review-definition — reviews perstack.toml against plan.md for domain alignment and instruction quality - @create-expert/test-expert — executes the test query against perstack.toml and reports what happened (no evaluation) - @create-expert/verify-test — executes hard signal checks, verifies their reproducibility, and checks the definition structure -## Write → Test → Verify Cycle +## Write → Review → Test → Verify Cycle 1. Delegate to write-definition: pass plan.md path (and existing perstack.toml path if Update mode) -2. Delegate to test-expert: pass the test query from plan.md, perstack.toml path, and coordinator expert name (do NOT pass verification signals — test-expert is a pure executor) -3. Delegate to verify-test: pass the test-expert result, the verification signals from plan.md, and the perstack.toml path -4. If verify-test returns CONTINUE: delegate to write-definition with the failure feedback, then restart from step 2 -5. If verify-test returns PASS: done — attemptCompletion with the verification evidence +2. Delegate to review-definition: pass plan.md path and perstack.toml path +3. 
If review-definition returns CONTINUE: delegate to write-definition with the review feedback, then restart from step 2 +4. If review-definition returns PASS: delegate to test-expert with the test query from plan.md, perstack.toml path, and coordinator expert name (do NOT pass verification signals — test-expert is a pure executor) +5. Delegate to verify-test: pass the test-expert result, the verification signals from plan.md, and the perstack.toml path +6. If verify-test returns CONTINUE: delegate to write-definition with the failure feedback, then restart from step 2 +7. If verify-test returns PASS: done — attemptCompletion with the verification evidence ### Why one query is enough Hard signals are deterministic — same input, same result. If all signals pass AND reproduce identically on re-execution (verified by verify-test's reproducibility step), a single query provides the same confidence as multiple runs. Multiple queries compensate for soft signals; hard signals need no compensation. @@ -255,6 +260,7 @@ Delegate to exactly ONE delegate per response. Do NOT include multiple delegatio """ delegates = [ "@create-expert/write-definition", + "@create-expert/review-definition", "@create-expert/test-expert", "@create-expert/verify-test", ] @@ -323,37 +329,16 @@ instruction = \"\"\"Domain knowledge.\"\"\" - **defaultModelTier**: always set per expert. Use "low" for mechanical/routine tasks, "middle" for moderate reasoning, "high" for complex judgment. - **TOML**: triple-quoted strings for multi-line instructions. Every expert needs version, description, instruction. `"@perstack/base"` is the exact required key — never `"base"` or aliases. -## Instruction Quality Rules - -The instruction field is the most impactful part of the definition. 
Apply these filters strictly: - -### What belongs in an instruction -- Domain-specific constraints the LLM cannot know (business rules, quality bars, policies, tradeoffs) -- Anti-patterns specific to this expert's domain -- Completion criteria — what "done" looks like -- Priority rules for when constraints conflict - -### What does NOT belong in an instruction -- **Implementation details the LLM already knows** — code snippets, file structure specifications, tool/library recommendations, configuration boilerplate. The LLM has broad training across programming, writing, design, analysis, and other domains. State the constraint or requirement; trust the LLM to choose the implementation. An instruction that explains *how* to do something the LLM already knows is wasted space. -- **General domain knowledge** — well-known techniques, standard practices, textbook algorithms. Naming them as requirements is fine ("use seedable RNG", "follow APA citation style"); explaining how they work is not. -- **Step-by-step procedures** — "first do X, then Y, then Z." Define the goal and constraints; the LLM will figure out the steps. Numbered checklists and ordered task lists are procedures in disguise. -- **Specific output structures** — exact file paths, section templates, schema definitions. Describe what the output must contain and its quality bar, not its exact shape. The LLM will organize the output appropriately for the task. - -### Instruction content checklist -Before finalizing perstack.toml, check every instruction (coordinator excluded from line limit) against these binary rules. If any check fails, fix it before writing. -1. **No code blocks**: instruction contains no ``` fenced code. Remove any code snippets, shell commands, JSON examples, or inline templates. -2. **No library/tool names**: instruction names no specific library, framework, or tool. Replace with capability requirement ("terminal UI library" not "blessed or ink", "test framework" not "Jest"). -3. 
**No file paths**: instruction specifies no file or directory paths. Remove all path references — the LLM decides file structure. -4. **No procedures**: instruction contains no numbered step sequences or ordered checklists. State the goal and constraints, not the steps. -5. **No technique explanations**: instruction does not explain well-known techniques. Name them as requirements if needed ("seedable RNG", "immutable state transitions"), never explain how they work. -6. **Line budget**: non-coordinator instruction is ≤ 15 lines. If over, re-check each line against rules 1-5. - -### Structure checklist -1. **Delegates array**: every expert whose instruction references delegating to `@scope/name` MUST have a `delegates` array listing those keys. Without it, delegation silently fails at runtime. -2. **Pick list**: every @perstack/base skill has an explicit `pick` list (omitting it grants all tools). +## Structure Checklist + +These are TOML correctness rules — without them, the runtime fails silently: +1. **Delegates array**: every expert that delegates to others MUST have a `delegates` array listing all delegate keys. +2. **Pick list**: every @perstack/base skill has an explicit `pick` list. 3. **defaultModelTier**: every expert has this set. -4. **Verifier exec capability**: if the delegation tree includes a verifier expert, it MUST have `exec` in its pick list. Without exec, verification degrades to file reading — a soft signal that cannot catch runtime failures. -5. **Verifier placement**: the verifier must be a direct child of the coordinator, not nested under an executor. This ensures context separation — the verifier does not share context with the generator. +4. **Verifier exec capability**: verifier experts MUST have `exec` in their pick list. +5. **Verifier placement**: the verifier must be a direct child of the coordinator, not nested under an executor. 
+ +Instruction quality and plan alignment are enforced by review-definition — focus on correct TOML structure and composing instructions from plan.md's domain knowledge. ## Description Rules @@ -390,6 +375,52 @@ pick = [ "attemptCompletion", ] +# ============================================================================= +# review-definition — Plan Alignment Reviewer (Soft Gate) +# ============================================================================= + +[experts."@create-expert/review-definition"] +defaultModelTier = "low" +version = "1.0.16" +description = """ +Reviews perstack.toml against plan.md for domain knowledge alignment and instruction quality. +Provide: (1) path to plan.md, (2) path to perstack.toml. +Returns: PASS or CONTINUE with per-item ✓/✗ results. +""" +instruction = """ +You review whether perstack.toml faithfully reflects plan.md. Read both files, then check each item below. Mark each ✓ or ✗ with a one-line reason. + +## Checklist + +For each domain constraint in plan.md's Domain Knowledge: +- Reflected in the appropriate expert's instruction? (✓/✗) + +For each verification signal in plan.md's Verification Signals: +- Verifier expert's instruction and capabilities support executing it? (✓/✗) + +For library/tool names appearing in any instruction: +- Traceable to plan.md's Domain Knowledge (i.e., user-specified)? (✓/✗ per name) +- If not user-specified, flag it — the instruction should describe the capability, not name the tool. + +For each non-coordinator instruction: +- Contains only domain constraints the LLM cannot derive on its own? (✓/✗) +- Free of code snippets, JSON schemas, and step-by-step procedures? (✓/✗) + +## Verdicts + +- **PASS** — all items ✓. +- **CONTINUE** — any item ✗. List each with: what's wrong, which expert, specific fix. + +attemptCompletion with: verdict and per-item results. 
+""" + +[experts."@create-expert/review-definition".skills."@perstack/base"] +type = "mcpStdioSkill" +description = "File reading and task completion" +command = "npx" +packageName = "@perstack/base" +pick = ["readTextFile", "todo", "attemptCompletion"] + # ============================================================================= # verify-test — Test Verifier # =============================================================================