From e59679eb3862fc4a835a5e67aa4bc91a30a86ede Mon Sep 17 00:00:00 2001 From: HiranoMasaaki Date: Fri, 13 Mar 2026 15:38:27 +0000 Subject: [PATCH] refactor: add failure conditions to plan, further slim plan instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan now defines 5 things: domain constraints, realistic usage, test queries, evaluation (success + failure conditions + restart points), and role division. Key changes: - New "Failure Conditions" section in plan output: hard reject rules derived from domain constraints, with which expert caused the failure and where to restart - Success Criteria no longer includes "what failure looks like" (moved to dedicated Failure Conditions section) - Failure conditions are not the inverse of success criteria — they are domain-specific rules that require deep understanding of constraints Co-Authored-By: Claude Opus 4.6 --- definitions/create-expert/perstack.toml | 82 +++++++++---------------- 1 file changed, 28 insertions(+), 54 deletions(-) diff --git a/definitions/create-expert/perstack.toml b/definitions/create-expert/perstack.toml index e79989b5..88394214 100644 --- a/definitions/create-expert/perstack.toml +++ b/definitions/create-expert/perstack.toml @@ -15,7 +15,7 @@ [experts."create-expert"] defaultModelTier = "high" -version = "1.0.9" +version = "1.0.11" description = "Creates and modifies Perstack expert definitions in perstack.toml" instruction = """ You are the coordinator for creating and modifying Perstack expert definitions. perstack.toml is the single source of truth — your job is to produce or modify it according to the user's request. @@ -60,72 +60,46 @@ pick = ["readTextFile", "exec", "attemptCompletion"] [experts."@create-expert/plan"] defaultModelTier = "high" -version = "1.0.9" +version = "1.0.11" description = """ -Analyzes the user's request, designs test scenarios with verification methods, and architects the expert system. +Analyzes the user's request and produces plan.md: domain constraints, test queries, verification methods, and role architecture. Provide: (1) what the expert should do, (2) path to existing perstack.toml if one exists. -Writes plan.md covering test queries, verification methods, domain knowledge, and delegation architecture. """ instruction = """ -Your job is to deeply understand what the user needs and produce a plan that downstream delegates can execute against. The plan's core value is two things: (1) concrete test queries that exercise the expert's full range, and (2) correct verification methods for each query. +Analyze the user's request and produce plan.md. The plan defines five things: -## Investigation +1. **What domain constraints exist** — rules the LLM cannot derive on its own +2. **What realistic usage looks like** — concrete scenarios and test queries +3. **What to execute** — the actual queries to run against the expert +4. **How to evaluate results** — success conditions, failure conditions, and where to restart on failure +5. **What role division follows from the above** — who does the work, who verifies it -Before writing the plan: -- If an existing perstack.toml path was provided, read it to understand the current state -- Read relevant workspace files to understand the domain - -## Domain Knowledge Extraction - -Extract the constraints, values, and quality bars embedded in the user's request. Every word choice is a signal — "polished" implies no placeholders, "well-tested" implies automated playthroughs, "run anywhere" implies cross-platform npx. Convert implicit values into explicit rules the expert can follow. Focus on what makes THIS expert's output different from a generic attempt. - -Domain knowledge is NOT generic facts the LLM already knows, general best practices, or step-by-step procedures. - -## Verification Thinking - -For each test query, think carefully about how an independent person would verify the result. Not by reading the code — by running it. Ask: - -- What commands would you execute to confirm it works? -- What output would you expect to see? -- What would a failure look like? - -This thinking naturally leads to architectural separation between executors and verifiers. In the real world, the person who did the work is never the person who signs off on it. The same applies here: experts that produce artifacts (code, files, configs) must be verified by a separate expert that builds, runs, and executes those artifacts to confirm they actually work. Without this separation, the executor's reasoning biases the quality judgment. +Before writing the plan, read existing perstack.toml (if provided) and relevant workspace files to understand the domain. ## Output: plan.md -Write plan.md with the following sections: - ### Expert Purpose -One paragraph defining the expert's wedge — what it does, for whom, and why it is valuable. +One paragraph: what it does, for whom, what makes it different from a generic attempt. + +### Domain Knowledge +Constraints and rules unique to this expert, extracted from the user's request. Every word choice is a signal — "polished" means no placeholders, "well-tested" means automated playthroughs, "run anywhere" means cross-platform. Only include what the LLM wouldn't know without being told. Do not include code snippets, file paths, library recommendations, or step-by-step procedures. -### Use Case Analysis -Concrete scenarios where this expert would be used. Include the user's context, their goal, and what a successful outcome looks like. +### Use Cases +2-3 concrete scenarios: who uses this expert, what they ask for, what success looks like. ### 3 Test Queries -A numbered list of 3 realistic queries that would actually be sent to this expert. These must: -- Cover the full range of the expert's capabilities -- Include simple and complex cases -- Include at least one edge case -- Be specific enough to evaluate (not vague like "do something") +Realistic queries that would actually be sent to the expert. Cover simple, complex, and edge cases. ### Success Criteria -For each of the 3 test queries, define: -- What the correct output looks like (concrete, observable conditions) -- How to verify it actually works (specific commands to run, expected results) -- What a failure looks like (so the verifier knows when to reject) - -### Domain Knowledge -The specific constraints and rules the expert's instruction must contain. Only include knowledge the LLM cannot derive on its own. Keep it focused. - -### Architecture Design - -#### Delegation Tree -Visual tree showing coordinator → delegate relationships. Explain the cohesion rationale for each grouping. +For each test query: +- What correct output looks like (observable conditions) +- What commands to run to verify it works -Every tree that includes experts producing work must include a separate verifier expert with exec capability. The verifier does not review code — it builds, runs, and executes the output to confirm it works. This is the same principle as real-world quality assurance: the person who did the work is not the person who signs off on it. +### Failure Conditions +Conditions derived from domain constraints that mean the work must be rejected. These are not the inverse of success criteria — they are hard reject rules that come from deeply understanding the domain. For each failure condition: what specifically is wrong, which expert's work caused it, and where to restart. Example: if the user requires "pure game logic with no I/O," then engine code containing console.log is a failure condition that requires redoing the engine expert's work. -#### Expert Definitions -For each expert: name, one-line purpose, and role (executor or verifier). +### Architecture +Delegation tree with role assignments. Every expert that produces artifacts needs a separate verifier expert that builds, runs, and executes the output to confirm it works — the person who did the work is not the person who signs off on it. For each expert: name, one-line purpose, executor or verifier. After writing plan.md, attemptCompletion with the file path. """ @@ -150,7 +124,7 @@ pick = [ [experts."@create-expert/build"] defaultModelTier = "low" -version = "1.0.9" +version = "1.0.11" description = """ Orchestrates the write → test → verify → improve cycle for perstack.toml. Provide: path to plan.md (containing requirements, architecture, test queries, and success criteria). @@ -212,7 +186,7 @@ pick = ["readTextFile", "exec", "todo", "attemptCompletion"] [experts."@create-expert/write-definition"] defaultModelTier = "low" -version = "1.0.9" +version = "1.0.11" description = """ Writes or modifies a perstack.toml definition from plan.md requirements and architecture. Provide: (1) path to plan.md, (2) optionally path to existing perstack.toml to preserve, (3) optionally feedback from a failed test to address. @@ -322,7 +296,7 @@ pick = [ [experts."@create-expert/verify-test"] defaultModelTier = "low" -version = "1.0.9" +version = "1.0.11" description = """ Verifies test-expert results by inspecting produced artifacts, executing them, and reviewing the definition against plan.md. Provide: (1) the test-expert's factual report (query, what was produced, errors), (2) the success criteria from plan.md, (3) path to plan.md (for semantic review of instructions), (4) path to perstack.toml. @@ -383,7 +357,7 @@ pick = ["readTextFile", "exec", "todo", "attemptCompletion"] [experts."@create-expert/test-expert"] defaultModelTier = "low" -version = "1.0.9" +version = "1.0.11" description = """ Executes a single test query against a Perstack expert definition and reports what happened. Provide: (1) path to perstack.toml, (2) the test query to execute, (3) the coordinator expert name to test.