From 49c7fe3e38ee8dcec854f5cd1f6f4c8f843d1a66 Mon Sep 17 00:00:00 2001 From: stepan_romankov Date: Mon, 25 May 2026 14:40:13 +0200 Subject: [PATCH 1/4] =?UTF-8?q?feat(skill):=20add=20business-level=20namin?= =?UTF-8?q?g=20guidance=20(Variant=20A=20=E2=80=94=20single=20skill)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variant A of A/B test for samurai test-naming guidance. Extends the existing skill with an inline Naming section in SKILL.md (forbidden patterns + lazy-load pointer) plus skill/naming.md with the actor / verb / outcome replacement protocol, heuristic table, and bad → good pairs grounded in real production trees. Worked example in api.md updated so its Test() names already clear the new bar (no "list empty", no generic "check"). Co-Authored-By: Claude Opus 4.7 --- cmd/claude-setup/main.go | 8 +++- cmd/claude-setup/skill/SKILL.md | 14 +++++++ cmd/claude-setup/skill/api.md | 4 +- cmd/claude-setup/skill/naming.md | 70 ++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 cmd/claude-setup/skill/naming.md diff --git a/cmd/claude-setup/main.go b/cmd/claude-setup/main.go index f774f3b..57e9c16 100644 --- a/cmd/claude-setup/main.go +++ b/cmd/claude-setup/main.go @@ -4,7 +4,7 @@ // // go run github.com/zerosixty/samurai/cmd/claude-setup@latest // -// It creates .claude/skills/samurai/ with SKILL.md, api.md and pitfalls.md +// It creates .claude/skills/samurai/ with SKILL.md, api.md, naming.md and pitfalls.md // so Claude Code understands the samurai API when writing or modifying tests. package main @@ -23,6 +23,9 @@ var skillContent string //go:embed skill/api.md var apiContent string +//go:embed skill/naming.md +var namingContent string + //go:embed skill/pitfalls.md var pitfallsContent string @@ -74,6 +77,7 @@ func main() { files := map[string]string{ "SKILL.md": fmt.Sprintf("%s\n%s\n", skillContent, skillMarker(version)), "api.md": apiContent, + "naming.md": namingContent, "pitfalls.md": pitfallsContent, } @@ -88,7 +92,7 @@ func main() { oldSamurai := filepath.Join(skillDir, "samurai.md") os.Remove(oldSamurai) // ignore error — may not exist - fmt.Println("Created .claude/skills/samurai/ (SKILL.md, api.md, pitfalls.md)") + fmt.Println("Created .claude/skills/samurai/ (SKILL.md, api.md, naming.md, pitfalls.md)") fmt.Println("Claude Code will use samurai context when working with tests.") fmt.Println("Invoke /samurai to load the reference manually.") } diff --git a/cmd/claude-setup/skill/SKILL.md b/cmd/claude-setup/skill/SKILL.md index f3c6143..c1e0d82 100644 --- a/cmd/claude-setup/skill/SKILL.md +++ b/cmd/claude-setup/skill/SKILL.md @@ -20,6 +20,19 @@ description: "Samurai scoped testing framework for Go (github.com/zerosixty/samu - Parallel by default; `samurai.Sequential()` forces order; `go test -parallel N` controls concurrency - Assertion-agnostic: use `w.Testing()` with any library (testify, is, stdlib) +## Naming + +Every `s.Test("name", ...)` — top, intermediate, leaf — names a business action or outcome from the actor's point of view. Never the function under test, the assertion, or the input data shape. The same bar applies at every level of the tree. + +Forbidden patterns (flag and rewrite when authoring or reviewing): + +- **Identifier leak** — any CamelCase token that appears as a Go identifier in the Test body (function, type, method). `"UpdateQuote calls produce audit records"` leaks `UpdateQuote`. +- **HTTP / status leak** — bare 3-digit codes matching `\b[1-5]\d{2}\b`. `"returns 409"` → `"rejects duplicate id"`. +- **Assertion phrasing** — verbs like `is / equals / returns / not nil / has length / contains` describing the value, not the step. `"err is not nil"` → `"rejects invalid signature"`. +- **Structure phrasing** — input-shape words like `empty / nil / with N keys / map / slice / array / struct`. `"with empty slice"` → `"returns no matches when filter is unset"`. + +When a name (yours or one you are reviewing) matches any of the above, read the `Test()` body and rewrite. Do not silently change names — propose 1–2 concrete replacements derived from the body. See [naming.md](naming.md) for the replacement protocol and worked examples. + ## File layout For samurai test files, top-to-bottom: @@ -60,4 +73,5 @@ Cost: samurai re-executes the full path per leaf — same cost as manually dupli ## Additional resources (lazy-load — read only when triggered) - **Read [api.md](api.md) only when** you need the worked-out `samurai.Run` / `RunWith` API example (e.g. wiring a new test scope, writing a custom context type for `RunWith`). The "When to Use" / "Detection protocol" sections above are sufficient for review and detection work. +- **Read [naming.md](naming.md) only when** authoring a new `s.Test(...)` name and the right phrasing isn't obvious, or reviewing a samurai test where at least one name matches a forbidden pattern above. The inline "Naming" section is enough for clearly-good or clearly-bad names; load this when you need the actor/verb/outcome replacement protocol. - **Read [pitfalls.md](pitfalls.md) only when** debugging a panic, a validation error, or a wrong pattern (e.g. asserting on a parent's result inside a child Test, declaring vars where they should be assigned). Skip otherwise — the rules above already cover the common authoring decisions. diff --git a/cmd/claude-setup/skill/api.md b/cmd/claude-setup/skill/api.md index 932f524..1f33407 100644 --- a/cmd/claude-setup/skill/api.md +++ b/cmd/claude-setup/skill/api.md @@ -25,7 +25,7 @@ func TestFeature(t *testing.T) { assert.NotEmpty(w.Testing(), user.Role) // 6. one action + assertions = one Test }) - s.Test("list empty", func(ctx context.Context, w samurai.W) { + s.Test("lists no users initially", func(ctx context.Context, w samurai.W) { users, err := db.ListUsers(ctx) // 7. own db — isolated from "create user" paths assert.NoError(w.Testing(), err) assert.Empty(w.Testing(), users) @@ -59,7 +59,7 @@ func TestWithAssertions(t *testing.T) { samurai.RunWith(t, func(w samurai.W) *MyCtx { return &MyCtx{BaseContext: w, Assertions: assert.New(w.Testing())} }, func(s *S) { - s.Test("check", func(_ context.Context, c *MyCtx) { + s.Test("the answer matches the spec", func(_ context.Context, c *MyCtx) { c.Equal(42, value) // assertion methods directly on context }) }) diff --git a/cmd/claude-setup/skill/naming.md b/cmd/claude-setup/skill/naming.md new file mode 100644 index 0000000..90f9ca6 --- /dev/null +++ b/cmd/claude-setup/skill/naming.md @@ -0,0 +1,70 @@ +# Samurai Test Naming + +The "Naming" section in SKILL.md states the bar and the four forbidden patterns. This file is the replacement protocol and worked examples — load it when you have a bad name in hand and need to propose a good one. + +## Replacement protocol + +Do not look at the diff hunk header, the parent test function name, or the function under test. **Open the `Test()` callback body.** Identify three things from the code in that body: + +1. **Actor** — who acts. Usually a domain noun: `borrower`, `admin`, `provider`, `API caller`, `counterparty`. If there is no clear actor, the system itself (`the service`, `the API`) is acceptable but rarely the best choice. +2. **Verb** — the action call that mutates state or makes a request: `db.CreateUser`, `client.UpdateQuote`, `svc.Settle`. Express it as a domain verb (`creates`, `updates`, `settles`), not the Go method name. +3. **Outcome** — what the assertions in the same callback verify: a state change, a rejection, a returned value's domain meaning. + +Write a 3–7 word, lowercase, present-tense phrase combining (1)+(2)+(3) — or (1)+(2) when the outcome is implied by success. Propose 1–2 alternatives so the author can pick. + +If you cannot identify the actor or the outcome from the body, the `Test()` is doing too much or too little — flag this before renaming. Either the action is missing (a setup-only test) or many actions are bundled (split it). + +## Detection heuristics + +| Pattern | Why it's bad | What to look at in the body instead | +|---|---|---| +| CamelCase token matching a Go identifier in the Test body | Couples the test name to the SUT symbol; rename of the function breaks the name | The effect of that call in domain terms | +| Bare HTTP status code (`200`, `404`, `409`) | Leaks the HTTP layer; the same behaviour at gRPC would be unnamable | The outcome verb: `accepts`, `rejects duplicate id`, `reports missing` | +| `is / equals / returns / not nil / has length / contains` describing a value | Describes the assertion, not the step | The action that produced the value; the assertion is implementation | +| `empty / nil / with N keys / map / slice / struct` | Leaks the input data structure | The domain meaning of the input: `no past loans`, `missing preferences`, `unset filter` | + +## Bad → good (grounded in real samurai trees) + +- `"UpdateQuote calls produce audit records with matching client quote ids"` — identifier leak (`UpdateQuote`) + → `"updating a quote writes an audit record"` + → `"the audit record links to the originating client quote"` + +- `"err is not nil"` — assertion phrasing + → `"rejects request with invalid signature"` + → `"refuses unauthenticated callers"` + +- `"returns 409"` — HTTP leak + → `"rejects duplicate id"` + → `"refuses to create a provider that already exists"` + +- `"with empty slice"` — structure leak + → `"returns no matches when the filter is unset"` + → `"borrower has no past loans"` + +- `"CreateProvider returns nil"` — identifier leak + assertion phrasing + → `"creates a new provider successfully"` + → `"a fresh provider id is issued"` + +## Uniformly across tree levels + +The bar is the same at top, intermediate, and leaf. Example: + +``` +"borrower opens a credit line" // top — business setup, not "CreateCreditLine returns ok" + ├─ "borrower draws on the credit line" // intermediate — action, not "Draw=true" + │ ├─ "balance reflects the drawdown" // leaf — outcome, not "balance equals 100" + │ └─ "draw is rejected when limit is exceeded" + └─ "counterparty accepts KYB access" +``` + +No level mentions the Go type (`CreditLine`), the method (`Draw`), the HTTP code, the assertion verb (`equals`), or the data shape (`with balance > 0`). + +## During review + +When reviewing a samurai test, scan every `s.Test(...)` string literal against the heuristics above. For each hit: + +1. Quote the offending name and identify which forbidden pattern it matches. +2. Read the `Test()` body and derive actor + verb + outcome. +3. Propose 1–2 concrete replacements. + +Do not silently rewrite — surface the issue and the proposal so the author confirms domain accuracy. From d7d2f05e3246691daf63bf7b68c25f97cb550fff Mon Sep 17 00:00:00 2001 From: stepan_romankov Date: Mon, 25 May 2026 15:36:01 +0200 Subject: [PATCH 2/4] feat(skill): switch naming section to V2 (eval winner) Hardened A/B eval (no lazy load + mixed fixtures + N=3) ranked V2 at 98.4% vs V1 at 92.1%. V1's extra detail over-flagged passive voice ("first loan is added to the history"). V2 keeps all 4 forbidden patterns with examples and adds a safe-examples disclaimer so domain-natural uses of "has", "returns", "no" stay PASS. Full results: eval/naming-length/RESULTS-HARD.md Co-Authored-By: Claude Opus 4.7 --- cmd/claude-setup/skill/SKILL.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cmd/claude-setup/skill/SKILL.md b/cmd/claude-setup/skill/SKILL.md index c1e0d82..c06e06c 100644 --- a/cmd/claude-setup/skill/SKILL.md +++ b/cmd/claude-setup/skill/SKILL.md @@ -22,16 +22,16 @@ description: "Samurai scoped testing framework for Go (github.com/zerosixty/samu ## Naming -Every `s.Test("name", ...)` — top, intermediate, leaf — names a business action or outcome from the actor's point of view. Never the function under test, the assertion, or the input data shape. The same bar applies at every level of the tree. +Every `s.Test("name", ...)` — at every tree level — names a business action or outcome from the actor's POV. Not the function under test, the assertion, or the input shape. -Forbidden patterns (flag and rewrite when authoring or reviewing): +Forbidden patterns: -- **Identifier leak** — any CamelCase token that appears as a Go identifier in the Test body (function, type, method). `"UpdateQuote calls produce audit records"` leaks `UpdateQuote`. -- **HTTP / status leak** — bare 3-digit codes matching `\b[1-5]\d{2}\b`. `"returns 409"` → `"rejects duplicate id"`. -- **Assertion phrasing** — verbs like `is / equals / returns / not nil / has length / contains` describing the value, not the step. `"err is not nil"` → `"rejects invalid signature"`. -- **Structure phrasing** — input-shape words like `empty / nil / with N keys / map / slice / array / struct`. `"with empty slice"` → `"returns no matches when filter is unset"`. +- **Identifier leak** — CamelCase tokens matching Go identifiers in the Test body. `"UpdateQuote calls..."` leaks `UpdateQuote`. +- **HTTP code leak** — bare 3-digit codes (`\b[1-5]\d{2}\b`) or `4xx/5xx` shorthand. `"returns 409"` → `"rejects duplicate id"`. +- **Assertion phrasing** — `is / equals / returns / not nil / has length / contains / has`. `"err is not nil"` → `"rejects invalid signature"`. +- **Structure phrasing** — `empty / nil / with N / map / slice / struct`. `"with empty slice"` → `"borrower has no past loans"`. -When a name (yours or one you are reviewing) matches any of the above, read the `Test()` body and rewrite. Do not silently change names — propose 1–2 concrete replacements derived from the body. See [naming.md](naming.md) for the replacement protocol and worked examples. +Risky-looking words used in a domain-natural sense (`borrower returns the equipment`, `has insufficient collateral`, `no past loans`) are clean — only flag when the word describes the assertion, the data shape, or the identifier. Flag matches, read the body, propose 1–2 replacements. See [naming.md](naming.md) for the protocol. ## File layout From fe713c1cad90992d780fee89cdf262a40603860b Mon Sep 17 00:00:00 2001 From: stepan_romankov Date: Mon, 25 May 2026 16:18:39 +0200 Subject: [PATCH 3/4] docs(eval): add naming-length A/B eval (rationale for V2) Records the two-pass evaluation that picked V2 (129w) as the inline naming section: first eval was non-discriminating because every variant loaded the lazy naming.md (RESULTS.md); hardened eval (RESULTS-HARD.md) stripped the pointer, used mixed textbook/borderline/clean fixtures, and ran N=3 per cell to expose stable failure modes. V2 won at 98.4%. Includes both fixture sets, both ground-truth files, all 5 variants (loaded and no-load), and the prompt-build script. --- eval/naming-length/RESULTS-HARD.md | 91 +++++++++++++++++++ eval/naming-length/RESULTS.md | 69 ++++++++++++++ eval/naming-length/build_prompts.sh | 51 +++++++++++ .../fixtures/F1_function_leak.go.txt | 35 +++++++ eval/naming-length/fixtures/F1_hard.go.txt | 39 ++++++++ .../fixtures/F2_assertion_phrasing.go.txt | 40 ++++++++ eval/naming-length/fixtures/F2_hard.go.txt | 41 +++++++++ eval/naming-length/fixtures/F3_hard.go.txt | 39 ++++++++ .../fixtures/F3_structure_leak.go.txt | 37 ++++++++ eval/naming-length/fixtures/F4_hard.go.txt | 38 ++++++++ .../fixtures/F4_http_leak.go.txt | 37 ++++++++ .../fixtures/F5_good_names_control.go.txt | 45 +++++++++ eval/naming-length/fixtures/F5_hard.go.txt | 42 +++++++++ .../fixtures/expectations-hard.json | 56 ++++++++++++ eval/naming-length/fixtures/expectations.json | 50 ++++++++++ eval/naming-length/variants/naming.md | 70 ++++++++++++++ eval/naming-length/variants/no-load/v1.md | 12 +++ eval/naming-length/variants/no-load/v2.md | 12 +++ eval/naming-length/variants/no-load/v3.md | 5 + eval/naming-length/variants/no-load/v4.md | 3 + eval/naming-length/variants/no-load/v5.md | 3 + eval/naming-length/variants/v1.md | 12 +++ eval/naming-length/variants/v2.md | 12 +++ eval/naming-length/variants/v3.md | 5 + eval/naming-length/variants/v4.md | 3 + eval/naming-length/variants/v5.md | 3 + 26 files changed, 850 insertions(+) create mode 100644 eval/naming-length/RESULTS-HARD.md create mode 100644 eval/naming-length/RESULTS.md create mode 100644 eval/naming-length/build_prompts.sh create mode 100644 eval/naming-length/fixtures/F1_function_leak.go.txt create mode 100644 eval/naming-length/fixtures/F1_hard.go.txt create mode 100644 eval/naming-length/fixtures/F2_assertion_phrasing.go.txt create mode 100644 eval/naming-length/fixtures/F2_hard.go.txt create mode 100644 eval/naming-length/fixtures/F3_hard.go.txt create mode 100644 eval/naming-length/fixtures/F3_structure_leak.go.txt create mode 100644 eval/naming-length/fixtures/F4_hard.go.txt create mode 100644 eval/naming-length/fixtures/F4_http_leak.go.txt create mode 100644 eval/naming-length/fixtures/F5_good_names_control.go.txt create mode 100644 eval/naming-length/fixtures/F5_hard.go.txt create mode 100644 eval/naming-length/fixtures/expectations-hard.json create mode 100644 eval/naming-length/fixtures/expectations.json create mode 100644 eval/naming-length/variants/naming.md create mode 100644 eval/naming-length/variants/no-load/v1.md create mode 100644 eval/naming-length/variants/no-load/v2.md create mode 100644 eval/naming-length/variants/no-load/v3.md create mode 100644 eval/naming-length/variants/no-load/v4.md create mode 100644 eval/naming-length/variants/no-load/v5.md create mode 100644 eval/naming-length/variants/v1.md create mode 100644 eval/naming-length/variants/v2.md create mode 100644 eval/naming-length/variants/v3.md create mode 100644 eval/naming-length/variants/v4.md create mode 100644 eval/naming-length/variants/v5.md diff --git a/eval/naming-length/RESULTS-HARD.md b/eval/naming-length/RESULTS-HARD.md new file mode 100644 index 0000000..893b013 --- /dev/null +++ b/eval/naming-length/RESULTS-HARD.md @@ -0,0 +1,91 @@ +# Naming-section length A/B — hardened eval (no lazy load + mixed fixtures + N=3) + +## Why a second eval + +The first eval (`RESULTS.md`) was non-discriminating — every variant scored 20/20 because all variants pointed to the same lazy-loaded `naming.md` (681 words) and subagents loaded it. The inline section's own carrying capacity was never tested. + +This eval fixes three things: + +1. **No lazy load.** Variants live in `variants/no-load/v{1..5}.md`, stripped of the `[naming.md]` pointer. The directory has no `naming.md` sibling. The inline text is the only guidance. +2. **Mixed fixtures.** New `F{1..5}_hard.go.txt` mix textbook violations + borderline cases (passive voice, "has X" used in domain phrasing, "rejected" as domain outcome, status-range shorthand `4xx`) + clean controls with risky-looking words. +3. **N=3 per cell.** 5 variants × 5 fixtures × 3 runs = 75 subagent invocations, clean context each. + +Ground truth: 21 verdicts per run, 7 FLAG + 14 PASS — see `fixtures/expectations-hard.json`. Each name labelled `textbook`, `borderline`, or `clean` for diagnostic slicing. + +## Variant sizes + +| Variant | Words | What's in it | +|---|---|---| +| v1 | 206 | All 4 forbidden patterns + worked examples + safe-examples carve-out paragraph. | +| v2 | 129 | All 4 patterns with bulleted examples + safe-examples one-liner. | +| v3 | 49 | All 4 patterns with parenthesized examples + brief safe-words clause. | +| v4 | 31 | All 4 patterns in one sentence, no safe-examples clause. | +| v5 | 16 | "business actions, never function names, status codes, or assertions" — no structure clause. | + +## Scores + +| Variant | Total correct | Score | Per-run (run 1 / 2 / 3) | Failure mode | +|---|---|---|---|---| +| v1 (206w) | 58/63 | 92.1% | 20 / 18 / 20 | Over-flags passive voice (`"first loan is added to the history"` — all 3 runs FLAGged for "is") | +| **v2 (129w)** | **62/63** | **98.4%** | 21 / 20 / 21 | One stray false-FLAG; everything else stable. | +| v3 (49w) | 60/63 | 95.2% | 20 / 20 / 20 | Consistently false-FLAGs `"the request is rejected for missing auth"` (no carve-out for passive domain outcomes). | +| v4 (31w) | 59/63 | 93.7% | 20 / 19 / 20 | Consistently false-FLAGs `"borrower has insufficient collateral"` — v4 lists `has X` as forbidden but drops the safe-examples clause. | +| v5 (16w) | 54/63 | 85.7% | 18 / 16 / 18 | **Structure leak detection collapses.** All 3 runs PASS `"borrower with empty loan history"` (textbook structure leak — should FLAG). Also misses `"the new provider has the requested name"`. | + +## Per-fixture breakdown + +| | F1 (id) | F2 (assert) | F3 (struct) | F4 (HTTP) | F5 (control) | +|---|---|---|---|---|---| +| v1 | 12/12 | 11/12 | 9/12 | 11/12 | 15/15 | +| **v2** | **12/12** | **12/12** | **12/12** | **11/12** | **15/15** | +| v3 | 12/12 | 12/12 | 12/12 | 9/12 | 15/15 | +| v4 | 12/12 | 12/12 | 12/12 | 11/12 | 12/15 | +| v5 | 12/12 | 9/12 | 8/12 | 10/12 | 15/15 | + +## Where each variant breaks + +- **v1's over-detail backfires.** Adding `is` to the assertion-verb list trips passive constructions like `"is added"`. The longer prose gives the subagent more rules to misapply. +- **v2 is the sweet spot.** Lists all 4 patterns with concrete examples *and* has the safe-examples disclaimer. Enough constraint, not too much. +- **v3 holds on 3/4 categories** but its briefer safe-words clause isn't enough to protect `"rejected"` as a domain outcome. +- **v4 loses the safe-examples clause.** "has X" is listed as forbidden but `"borrower has insufficient collateral"` is exactly the canonical domain phrasing. Without the carve-out, the variant punishes correct names. +- **v5 collapses on structure-leak detection.** The 16-word one-liner has no `structure / data-shape / empty / nil` cue. Three textbook leaks pass through (`"with empty loan history"`, `"has the requested name"`), and `"empty"` slips entirely (all 3 runs). + +The degradation isn't monotonic in length — it's monotonic in **rule coverage**. v1 has too much detail, v5 has too little, v2–v4 trade precision against verbosity. + +## Why v2 beats v1 + +v1 (Variant A's current naming section) is the section originally drafted as "good enough". The eval shows it's *worse* than a 129-word version. v1's extra detail — explicit `is` in the assertion-verb list, regex hints, repeated examples — makes the rule too aggressive on perfectly fine passive constructions. Trimming detail to v2 actually raises score. + +## Recommendation + +**Ship v2** as the inline `## Naming` section in `cmd/claude-setup/skill/SKILL.md`, plus the full `naming.md` lazy-loaded. + +Rationale: +- v2 scored highest in the hardened eval (98.4% vs 92.1% for v1). +- v2 holds all 5 categories with one stray miss; no consistent failure mode. +- The `naming.md` lazy load remains net positive on borderline cases (it adds the actor/verb/outcome protocol), but is no longer load-bearing for basic rule application. +- Cost: 129 words inline, loaded with every samurai-skill activation. Acceptable. + +Concrete v2 to ship (back-edit the naming.md pointer in for production): + +```markdown +## Naming + +Every `s.Test("name", ...)` — at every tree level — names a business action or outcome from the actor's POV. Not the function under test, the assertion, or the input shape. + +Forbidden patterns: + +- **Identifier leak** — CamelCase tokens matching Go identifiers in the Test body. `"UpdateQuote calls..."` leaks `UpdateQuote`. +- **HTTP code leak** — bare 3-digit codes (`\b[1-5]\d{2}\b`) or `4xx/5xx` shorthand. `"returns 409"` → `"rejects duplicate id"`. +- **Assertion phrasing** — `is / equals / returns / not nil / has length / contains / has`. `"err is not nil"` → `"rejects invalid signature"`. +- **Structure phrasing** — `empty / nil / with N / map / slice / struct`. `"with empty slice"` → `"borrower has no past loans"`. + +Risky-looking words used in a domain-natural sense (`borrower returns the equipment`, `has insufficient collateral`, `no past loans`) are clean — only flag when the word describes the assertion, the data shape, or the identifier. Flag matches, read the body, propose 1–2 replacements. See [naming.md](naming.md) for the protocol. +``` + +## Eval quality notes + +- N=3 was enough to expose consistent failure modes (v3, v4, v5 each had a 3/3 false-FLAG or false-PASS). For ranking purposes more runs aren't needed. +- The 7-FLAG / 14-PASS asymmetry biases the eval toward variants that PASS too eagerly (v5 still scored 86% by passing everything in F5). Real samurai trees have closer to 50/50 — re-run on production samples to confirm v2 holds. +- Ground-truth for `"first loan is added to the history"` is a judgement call. The naming.md examples use passive forms (`"first loan is added"` is the spirit of `"a fresh provider id is issued"`). If you treat this as FLAG instead, v1 jumps from 92.1% to ~96.8% — still behind v2. +- Cost: 75 subagents × ~21k tokens ≈ 1.6M tokens. The original suspect eval was 25 × 23k ≈ 575k. Worth the 2.8× spend to actually answer the question. diff --git a/eval/naming-length/RESULTS.md b/eval/naming-length/RESULTS.md new file mode 100644 index 0000000..22043b7 --- /dev/null +++ b/eval/naming-length/RESULTS.md @@ -0,0 +1,69 @@ +# Naming-section length A/B — results + +## Setup + +- 5 length variants of the inline `## Naming` section in `SKILL.md`. +- All variants point to the same lazy-loaded `naming.md` (681 words: replacement protocol, heuristics table, 5 bad→good pairs, uniform-tree example). +- 5 fixtures × 1 run × 5 variants = 25 subagent invocations, each with clean context. +- Each subagent received the variant inline text + access to `naming.md` + fixture, and emitted a structured PASS/FLAG verdict per `s.Test(...)` literal. +- Ground truth: 20 verdicts total (13 FLAG, 7 PASS) per fixture set, in `fixtures/expectations.json`. + +## Variant sizes + +| Variant | Words | Description | +|---|---|---| +| v1 | 206 | Full Variant A naming section (rule + 4 forbidden patterns with regex hints + worked examples). | +| v2 | 129 | Trimmed: rule + bulleted forbidden patterns with 1 example each. | +| v3 | 49 | Compressed: rule + parenthesized forbidden examples. | +| v4 | 31 | Two-sentence: "names are business actions/outcomes... not function names, HTTP codes, assertion verbs, data-shape words". | +| v5 | 16 | One-liner: "`s.Test(...)` names are business actions, never function names, status codes, or assertions." | + +All variants end with a pointer to `naming.md`. + +## Scores + +| Variant | Correct verdicts | FP on controls | FN on FLAGs | +|---|---|---|---| +| v1 | 20/20 | 0 | 0 | +| v2 | 20/20 | 0 | 0 | +| v3 | 20/20 | 0 | 0 | +| v4 | 20/20 | 0 | 0 | +| v5 | 20/20 | 0 | 0 | + +Replacement quality (subjective, eyeballed across 65 FLAG blocks): consistently business-level across all variants — `"updating a quote writes an audit record"`, `"rejects duplicate provider name"`, `"borrower has no past loans"`, `"admin creates a new provider"`. No variant produced word-shuffle replacements. + +## Interpretation — caveat + +The eval **does not discriminate**. Even the 16-word v5 matched the 206-word v1 perfectly. Two non-exclusive explanations: + +1. **Lazy-loaded `naming.md` is doing the work.** Every variant points at it, and a one-shot eval with no context cost incentivises the subagent to load the reference unconditionally. So the eval measured "any pointer + full naming.md", not the inline section's own carrying capacity. +2. **The task is too easy.** The 5 fixtures use textbook violations (literal `CreateQuote`, literal `returns 409`, literal `err is not nil`, literal `with empty`). A one-line cue plus the user's prior knowledge of naming conventions is enough. + +The eval cannot tell which dominates without a follow-up run that suppresses the lazy load. + +## What would actually discriminate + +Re-run with **no access to `naming.md`** — only the inline variant text. Then v5 ("never function names, status codes, or assertions") is forced to carry the whole load. Predictions if that eval runs: + +- v1/v2 still ≈ 20/20 (self-contained). +- v3 likely 18–20 (compressed but mentions all 4 patterns). +- v4 likely 16–19 (drops the protocol — replacement quality may degrade). +- v5 likely 14–18 (no structure-leak language; "with empty" / "with nil filter" / "map with 0 entries" may slip through as PASS). + +## Recommendation + +**Ship a v3-class section** in `SKILL.md` (~50 words inline) + the full `naming.md` lazy-loaded. Rationale: + +- v3 names all 4 forbidden patterns with one literal example each — enough to trigger the lazy load on a match. +- v1's worked examples duplicate `naming.md`. Net cost to every samurai-skill activation. +- v4/v5 read as catchy one-liners but drop the structure-phrasing cue (the hardest category — "empty / nil / with N" doesn't pattern-match as obviously as `CreateQuote` or `409`). + +Concrete v3 inline: + +> ## Naming +> +> Every `s.Test("name", ...)` — at every tree level — names a business action or outcome. Forbidden: identifier leak (`UpdateQuote calls...`), HTTP codes (`returns 409`), assertion phrasing (`err is not nil`), structure phrasing (`with empty slice`). Flag matches; read the `Test()` body; propose 1–2 domain replacements. See [naming.md](naming.md). + +## Follow-up + +Before shipping, run the suppressed-load eval to confirm v3 holds without `naming.md`. If v3 degrades, fall back to v2. If even v5 holds (unlikely), the inline section can be reduced further. diff --git a/eval/naming-length/build_prompts.sh b/eval/naming-length/build_prompts.sh new file mode 100644 index 0000000..e2e55a1 --- /dev/null +++ b/eval/naming-length/build_prompts.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Builds 25 (variant × fixture) prompts for the subagent A/B test. +# Each prompt is self-contained: variant SKILL.md inline + shared naming.md + fixture code. + +set -euo pipefail + +cd "$(dirname "$0")" + +mkdir -p runs/prompts +rm -f runs/prompts/*.txt + +NAMING_MD=$(cat variants/naming.md) + +INSTRUCTIONS='--- END TEST FILE --- + +INSTRUCTIONS: + +For every `s.Test("...")` string literal in the file (in source order), output ONE block in this EXACT format: + +NAME: +VERDICT: PASS or FLAG +REASON: +REPLACEMENT: + +Separate blocks with a SINGLE blank line. Output nothing else — no preamble, no summary, no headers, no commentary.' + +for v in v1 v2 v3 v4 v5; do + variant_content=$(cat "variants/${v}.md") + for f in F1_function_leak F2_assertion_phrasing F3_structure_leak F4_http_leak F5_good_names_control; do + fixture_content=$(cat "fixtures/${f}.go.txt") + out="runs/prompts/${v}__${f}.txt" + { + echo "You are reviewing test names in a Go samurai test file. Apply the naming guidance below; output a structured per-name verdict." + echo + echo "--- GUIDANCE (inline SKILL.md section) ---" + echo "$variant_content" + echo "--- END GUIDANCE ---" + echo + echo "--- LAZY-LOADED naming.md (load only if your guidance refers to it or you need the full protocol) ---" + echo "$NAMING_MD" + echo "--- END naming.md ---" + echo + echo "--- TEST FILE TO REVIEW ---" + echo "$fixture_content" + echo "$INSTRUCTIONS" + } > "$out" + done +done + +echo "Built $(ls runs/prompts | wc -l | tr -d ' ') prompts in runs/prompts/" +ls runs/prompts/ | head -5 diff --git a/eval/naming-length/fixtures/F1_function_leak.go.txt b/eval/naming-length/fixtures/F1_function_leak.go.txt new file mode 100644 index 0000000..a1d0def --- /dev/null +++ b/eval/naming-length/fixtures/F1_function_leak.go.txt @@ -0,0 +1,35 @@ +package quotes_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestQuotes(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *QuoteService + var quote *Quote + + s.Test("CreateQuote returns a quote with id", func(_ context.Context, w samurai.W) { + svc = newQuoteService() + quote = svc.CreateQuote(QuoteRequest{Amount: 100}) + assert.NotEmpty(w.Testing(), quote.ID) + }, func(s *samurai.Scope) { + s.Test("UpdateQuote produces audit records with matching client quote ids", func(_ context.Context, w samurai.W) { + svc.UpdateQuote(quote.ID, QuoteUpdate{Amount: 200}) + audits := svc.AuditLog(quote.ID) + assert.Len(w.Testing(), audits, 1) + assert.Equal(w.Testing(), quote.ClientQuoteID, audits[0].ClientQuoteID) + }) + + s.Test("CancelQuote sets status to canceled", func(_ context.Context, w samurai.W) { + svc.CancelQuote(quote.ID) + updated := svc.GetQuote(quote.ID) + assert.Equal(w.Testing(), "canceled", updated.Status) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F1_hard.go.txt b/eval/naming-length/fixtures/F1_hard.go.txt new file mode 100644 index 0000000..5ffa70d --- /dev/null +++ b/eval/naming-length/fixtures/F1_hard.go.txt @@ -0,0 +1,39 @@ +package quotes_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestQuotes(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *QuoteService + var quote *Quote + + s.Test("CreateQuote returns a quote with id", func(_ context.Context, w samurai.W) { + svc = newQuoteService() + quote = svc.CreateQuote(QuoteRequest{Amount: 100}) + assert.NotEmpty(w.Testing(), quote.ID) + }, func(s *samurai.Scope) { + s.Test("updates a quote and writes an audit", func(_ context.Context, w samurai.W) { + svc.UpdateQuote(quote.ID, QuoteUpdate{Amount: 200}) + assert.Len(w.Testing(), svc.AuditLog(quote.ID), 1) + }) + + s.Test("the quote is updated via UpdateQuote", func(_ context.Context, w samurai.W) { + svc.UpdateQuote(quote.ID, QuoteUpdate{Amount: 300}) + updated := svc.GetQuote(quote.ID) + assert.Equal(w.Testing(), 300, updated.Amount) + }) + + s.Test("cancels an open quote", func(_ context.Context, w samurai.W) { + svc.CancelQuote(quote.ID) + updated := svc.GetQuote(quote.ID) + assert.Equal(w.Testing(), "canceled", updated.Status) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F2_assertion_phrasing.go.txt b/eval/naming-length/fixtures/F2_assertion_phrasing.go.txt new file mode 100644 index 0000000..1bf6331 --- /dev/null +++ b/eval/naming-length/fixtures/F2_assertion_phrasing.go.txt @@ -0,0 +1,40 @@ +package providers_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestProviderCreation(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *ProviderService + var existing *Provider + + s.Test("admin creates a new provider", func(ctx context.Context, w samurai.W) { + svc = newProviderService(ctx) + var err error + existing, err = svc.CreateProvider(ctx, "Acme Inc") + assert.NoError(w.Testing(), err) + assert.NotEmpty(w.Testing(), existing.ID) + }, func(s *samurai.Scope) { + s.Test("err is not nil for duplicate name", func(ctx context.Context, w samurai.W) { + _, err := svc.CreateProvider(ctx, "Acme Inc") + assert.Error(w.Testing(), err) + }) + + s.Test("count equals 1 after creation", func(ctx context.Context, w samurai.W) { + n := svc.CountProviders(ctx) + assert.Equal(w.Testing(), 1, n) + }) + + s.Test("looks up the provider by id", func(ctx context.Context, w samurai.W) { + found, err := svc.GetProvider(ctx, existing.ID) + assert.NoError(w.Testing(), err) + assert.Equal(w.Testing(), existing.Name, found.Name) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F2_hard.go.txt b/eval/naming-length/fixtures/F2_hard.go.txt new file mode 100644 index 0000000..9c32b98 --- /dev/null +++ b/eval/naming-length/fixtures/F2_hard.go.txt @@ -0,0 +1,41 @@ +package providers_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestProviderCreation(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *ProviderService + var existing *Provider + + s.Test("admin creates a new provider", func(ctx context.Context, w samurai.W) { + svc = newProviderService(ctx) + var err error + existing, err = svc.CreateProvider(ctx, "Acme Inc") + assert.NoError(w.Testing(), err) + assert.NotEmpty(w.Testing(), existing.ID) + }, func(s *samurai.Scope) { + s.Test("err is not nil for duplicate name", func(ctx context.Context, w samurai.W) { + _, err := svc.CreateProvider(ctx, "Acme Inc") + assert.Error(w.Testing(), err) + }) + + s.Test("the new provider has the requested name", func(ctx context.Context, w samurai.W) { + found, err := svc.GetProvider(ctx, existing.ID) + assert.NoError(w.Testing(), err) + assert.Equal(w.Testing(), "Acme Inc", found.Name) + }) + + s.Test("audits include the change", func(ctx context.Context, w samurai.W) { + _, _ = svc.RenameProvider(ctx, existing.ID, "Acme Holdings") + entries := svc.AuditLog(ctx, existing.ID) + assert.NotEmpty(w.Testing(), entries) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F3_hard.go.txt b/eval/naming-length/fixtures/F3_hard.go.txt new file mode 100644 index 0000000..1811513 --- /dev/null +++ b/eval/naming-length/fixtures/F3_hard.go.txt @@ -0,0 +1,39 @@ +package loans_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestLoanHistory(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *LoanService + var borrowerID string + + s.Test("borrower with empty loan history", func(ctx context.Context, w samurai.W) { + svc = newLoanService(ctx) + borrowerID = svc.RegisterBorrower(ctx, "alice") + loans := svc.LoanHistory(ctx, borrowerID) + assert.Empty(w.Testing(), loans) + }, func(s *samurai.Scope) { + s.Test("new borrower has no past loans", func(ctx context.Context, w samurai.W) { + loans := svc.LoanHistory(ctx, borrowerID) + assert.Len(w.Testing(), loans, 0) + }) + + s.Test("first loan is added to the history", func(ctx context.Context, w samurai.W) { + svc.IssueLoan(ctx, borrowerID, 1000) + loans := svc.LoanHistory(ctx, borrowerID) + assert.Len(w.Testing(), loans, 1) + }) + + s.Test("returns an empty list when filter is unset", func(ctx context.Context, w samurai.W) { + loans := svc.SearchLoans(ctx, nil) + assert.NotEmpty(w.Testing(), loans) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F3_structure_leak.go.txt b/eval/naming-length/fixtures/F3_structure_leak.go.txt new file mode 100644 index 0000000..df5522a --- /dev/null +++ b/eval/naming-length/fixtures/F3_structure_leak.go.txt @@ -0,0 +1,37 @@ +package lending_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestBorrowerLoans(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *LendingService + var borrower *Borrower + + s.Test("borrower with empty loan history", func(ctx context.Context, w samurai.W) { + svc = newLendingService(ctx) + borrower = svc.CreateBorrower(ctx, "Alice") + }, func(s *samurai.Scope) { + s.Test("returns map with 0 entries", func(ctx context.Context, w samurai.W) { + loans := svc.GetLoans(ctx, borrower.ID) + assert.Empty(w.Testing(), loans) + }) + + s.Test("with nil filter lists all loans across borrowers", func(ctx context.Context, w samurai.W) { + loans := svc.ListLoans(ctx, nil) + assert.NotNil(w.Testing(), loans) + }) + + s.Test("first loan is added to the history", func(ctx context.Context, w samurai.W) { + svc.OpenLoan(ctx, borrower.ID, 5000) + loans := svc.GetLoans(ctx, borrower.ID) + assert.Len(w.Testing(), loans, 1) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F4_hard.go.txt b/eval/naming-length/fixtures/F4_hard.go.txt new file mode 100644 index 0000000..453315c --- /dev/null +++ b/eval/naming-length/fixtures/F4_hard.go.txt @@ -0,0 +1,38 @@ +package providers_http_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestProvidersHTTP(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var client *APIClient + var id string + + s.Test("POST /providers returns 201", func(ctx context.Context, w samurai.W) { + client = newClient(ctx) + resp, _ := client.CreateProvider(ctx, "Acme") + id = resp.ID + assert.Equal(w.Testing(), 201, resp.Status) + }, func(s *samurai.Scope) { + s.Test("admin deletes the provider", func(ctx context.Context, w samurai.W) { + err := client.DeleteProvider(ctx, id) + assert.NoError(w.Testing(), err) + }) + + s.Test("the request is rejected for missing auth", func(ctx context.Context, w samurai.W) { + resp, _ := client.AsAnonymous().CreateProvider(ctx, "Beta") + assert.True(w.Testing(), resp.Rejected) + }) + + s.Test("returns 4xx on invalid input", func(ctx context.Context, w samurai.W) { + resp, _ := client.CreateProvider(ctx, "") + assert.GreaterOrEqual(w.Testing(), resp.Status, 400) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F4_http_leak.go.txt b/eval/naming-length/fixtures/F4_http_leak.go.txt new file mode 100644 index 0000000..4f5254b --- /dev/null +++ b/eval/naming-length/fixtures/F4_http_leak.go.txt @@ -0,0 +1,37 @@ +package httpapi_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestProvidersAPI(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var client *TestClient + + s.Test("POST /providers returns 201", func(ctx context.Context, w samurai.W) { + client = newTestClient(ctx) + resp := client.PostJSON("/providers", map[string]any{"name": "Acme"}) + assert.Equal(w.Testing(), 201, resp.StatusCode) + }, func(s *samurai.Scope) { + s.Test("returns 409 on duplicate name", func(ctx context.Context, w samurai.W) { + resp := client.PostJSON("/providers", map[string]any{"name": "Acme"}) + assert.Equal(w.Testing(), 409, resp.StatusCode) + }) + + s.Test("GET /providers/:id returns 200 with body", func(ctx context.Context, w samurai.W) { + resp := client.Get("/providers/acme") + assert.Equal(w.Testing(), 200, resp.StatusCode) + assert.Contains(w.Testing(), resp.Body, "Acme") + }) + + s.Test("admin deletes the provider", func(ctx context.Context, w samurai.W) { + resp := client.Delete("/providers/acme") + assert.Equal(w.Testing(), 204, resp.StatusCode) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F5_good_names_control.go.txt b/eval/naming-length/fixtures/F5_good_names_control.go.txt new file mode 100644 index 0000000..ea7d398 --- /dev/null +++ b/eval/naming-length/fixtures/F5_good_names_control.go.txt @@ -0,0 +1,45 @@ +package credit_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestCreditLine(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *CreditService + var borrower, counterparty *Provider + + s.Test("creditor grants a credit line to borrower", func(ctx context.Context, w samurai.W) { + svc = newCreditService(ctx) + borrower, counterparty = svc.SeedTwoProviders(ctx) + svc.GrantCreditLine(ctx, counterparty.ID, borrower.ID, 10_000) + }, func(s *samurai.Scope) { + s.Test("borrower draws on the credit line", func(ctx context.Context, w samurai.W) { + svc.Draw(ctx, borrower.ID, counterparty.ID, 3_000) + bal := svc.Balance(ctx, borrower.ID, counterparty.ID) + assert.Equal(w.Testing(), int64(7_000), bal.Available) + }, func(s *samurai.Scope) { + s.Test("borrower repays part of the drawn amount", func(ctx context.Context, w samurai.W) { + svc.Repay(ctx, borrower.ID, counterparty.ID, 1_000) + bal := svc.Balance(ctx, borrower.ID, counterparty.ID) + assert.Equal(w.Testing(), int64(8_000), bal.Available) + }) + + s.Test("draw is rejected when limit is exceeded", func(ctx context.Context, w samurai.W) { + err := svc.Draw(ctx, borrower.ID, counterparty.ID, 100_000) + assert.Error(w.Testing(), err) + }) + }) + + s.Test("counterparty revokes the credit line", func(ctx context.Context, w samurai.W) { + svc.RevokeCreditLine(ctx, counterparty.ID, borrower.ID) + bal := svc.Balance(ctx, borrower.ID, counterparty.ID) + assert.Equal(w.Testing(), int64(0), bal.Available) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/F5_hard.go.txt b/eval/naming-length/fixtures/F5_hard.go.txt new file mode 100644 index 0000000..d30abe1 --- /dev/null +++ b/eval/naming-length/fixtures/F5_hard.go.txt @@ -0,0 +1,42 @@ +package credit_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zerosixty/samurai" +) + +func TestCreditLine(t *testing.T) { + samurai.Run(t, func(s *samurai.Scope) { + var svc *CreditService + var lineID string + + s.Test("creditor grants a credit line", func(ctx context.Context, w samurai.W) { + svc = newCreditService(ctx) + lineID = svc.GrantLine(ctx, "borrower-1", 10_000) + assert.NotEmpty(w.Testing(), lineID) + }, func(s *samurai.Scope) { + s.Test("borrower returns the equipment", func(ctx context.Context, w samurai.W) { + err := svc.ReturnEquipment(ctx, lineID) + assert.NoError(w.Testing(), err) + }) + + s.Test("the loan is fully repaid", func(ctx context.Context, w samurai.W) { + svc.Repay(ctx, lineID, 10_000) + assert.True(w.Testing(), svc.IsClosed(ctx, lineID)) + }) + + s.Test("borrower has insufficient collateral", func(ctx context.Context, w samurai.W) { + ok := svc.CheckCollateral(ctx, "borrower-1", 50_000) + assert.False(w.Testing(), ok) + }) + + s.Test("creditor revokes access", func(ctx context.Context, w samurai.W) { + svc.RevokeLine(ctx, lineID) + assert.True(w.Testing(), svc.IsRevoked(ctx, lineID)) + }) + }) + }) +} diff --git a/eval/naming-length/fixtures/expectations-hard.json b/eval/naming-length/fixtures/expectations-hard.json new file mode 100644 index 0000000..2bbc07f --- /dev/null +++ b/eval/naming-length/fixtures/expectations-hard.json @@ -0,0 +1,56 @@ +{ + "_comment": "Hardened fixtures: each file mixes textbook violations, borderline cases, and clean controls. Borderline cases test whether the variant carries enough rule detail without naming.md.", + "fixtures": { + "F1_hard": { + "category": "identifier-leak-mixed", + "names": [ + {"name": "CreateQuote returns a quote with id", "verdict": "FLAG", "kind": "textbook", "why": "CamelCase identifier (CreateQuote) + assertion phrasing (returns ... with)"}, + {"name": "updates a quote and writes an audit", "verdict": "PASS", "kind": "clean", "why": "domain verbs (updates, writes) + domain objects; no identifier, no assertion, no structure"}, + {"name": "the quote is updated via UpdateQuote", "verdict": "FLAG", "kind": "borderline", "why": "identifier leak (UpdateQuote) even though phrased domain-first; rule: no CamelCase Go identifiers in the name"}, + {"name": "cancels an open quote", "verdict": "PASS", "kind": "clean", "why": "domain verb + domain object"} + ] + }, + "F2_hard": { + "category": "assertion-style-mixed", + "names": [ + {"name": "admin creates a new provider", "verdict": "PASS", "kind": "clean", "why": "actor + verb + outcome"}, + {"name": "err is not nil for duplicate name", "verdict": "FLAG", "kind": "textbook", "why": "assertion phrasing (err is not nil)"}, + {"name": "the new provider has the requested name", "verdict": "FLAG", "kind": "borderline", "why": "assertion phrasing (has the requested X) — describes the assertion value, not the step. Domain rewrite: 'provider is created with the requested name' or 'created provider preserves the submitted name'"}, + {"name": "audits include the change", "verdict": "PASS", "kind": "clean", "why": "domain verb (include) + domain object (audits, change); 'include' is not on the assertion-verb list"} + ] + }, + "F3_hard": { + "category": "structure-leak-mixed", + "names": [ + {"name": "borrower with empty loan history", "verdict": "FLAG", "kind": "textbook", "why": "structure phrasing (empty) + no action"}, + {"name": "new borrower has no past loans", "verdict": "PASS", "kind": "borderline", "why": "'no past loans' is the canonical good replacement for 'with empty loan history' per naming.md examples; describes domain state, not data shape"}, + {"name": "first loan is added to the history", "verdict": "PASS", "kind": "clean", "why": "domain outcome (loan added to history)"}, + {"name": "returns an empty list when filter is unset", "verdict": "FLAG", "kind": "borderline", "why": "assertion + structure phrasing (returns empty list); domain rewrite: 'unfiltered query returns all loans' or 'lists every loan when filter is unset'"} + ] + }, + "F4_hard": { + "category": "http-leak-mixed", + "names": [ + {"name": "POST /providers returns 201", "verdict": "FLAG", "kind": "textbook", "why": "HTTP method + status code"}, + {"name": "admin deletes the provider", "verdict": "PASS", "kind": "clean", "why": "actor + verb + domain object"}, + {"name": "the request is rejected for missing auth", "verdict": "PASS", "kind": "borderline", "why": "no status code, no method; 'rejected' is a domain outcome; clean even though it sits in an HTTP test file"}, + {"name": "returns 4xx on invalid input", "verdict": "FLAG", "kind": "borderline", "why": "HTTP status range shorthand (4xx) + assertion phrasing (returns); same category as bare status codes"} + ] + }, + "F5_hard": { + "category": "control-with-risky-words", + "names": [ + {"name": "creditor grants a credit line", "verdict": "PASS", "kind": "clean", "why": "actor + verb + outcome"}, + {"name": "borrower returns the equipment", "verdict": "PASS", "kind": "borderline", "why": "'returns' here is the domain verb (returning equipment), not the assertion verb describing a value. Action by an actor on a domain object"}, + {"name": "the loan is fully repaid", "verdict": "PASS", "kind": "clean", "why": "domain outcome; 'is' here is the state-of-being copula, not an assertion verb describing a value"}, + {"name": "borrower has insufficient collateral", "verdict": "PASS", "kind": "borderline", "why": "'has' here describes a domain condition of the actor, not an assertion about a returned value. Canonical domain phrasing"}, + {"name": "creditor revokes access", "verdict": "PASS", "kind": "clean", "why": "actor + verb + domain object"} + ] + } + }, + "totals": { + "by_kind": {"textbook": 5, "borderline": 7, "clean": 9}, + "by_verdict": {"FLAG": 7, "PASS": 14}, + "total": 21 + } +} diff --git a/eval/naming-length/fixtures/expectations.json b/eval/naming-length/fixtures/expectations.json new file mode 100644 index 0000000..87d75dd --- /dev/null +++ b/eval/naming-length/fixtures/expectations.json @@ -0,0 +1,50 @@ +{ + "_comment": "Each fixture lists every s.Test name and whether it should be FLAGged or stay quiet. The subagent scores against this.", + "fixtures": { + "F1_function_leak": { + "category": "function-method-leak", + "names": [ + {"name": "CreateQuote returns a quote with id", "verdict": "FLAG", "why": "identifier leak (CreateQuote) + assertion phrasing (returns ... with)"}, + {"name": "UpdateQuote produces audit records with matching client quote ids", "verdict": "FLAG", "why": "identifier leak (UpdateQuote)"}, + {"name": "CancelQuote sets status to canceled", "verdict": "FLAG", "why": "identifier leak (CancelQuote) + assertion phrasing (sets status to)"} + ] + }, + "F2_assertion_phrasing": { + "category": "assertion-style", + "names": [ + {"name": "admin creates a new provider", "verdict": "PASS", "why": "actor + verb + outcome"}, + {"name": "err is not nil for duplicate name", "verdict": "FLAG", "why": "assertion phrasing (err is not nil)"}, + {"name": "count equals 1 after creation", "verdict": "FLAG", "why": "assertion phrasing (count equals 1)"}, + {"name": "looks up the provider by id", "verdict": "PASS", "why": "action verb (looks up) + domain object"} + ] + }, + "F3_structure_leak": { + "category": "data-structure-leak", + "names": [ + {"name": "borrower with empty loan history", "verdict": "FLAG", "why": "structure phrasing (empty)"}, + {"name": "returns map with 0 entries", "verdict": "FLAG", "why": "data-structure leak (map, 0 entries) + assertion phrasing (returns)"}, + {"name": "with nil filter lists all loans across borrowers", "verdict": "FLAG", "why": "structure phrasing (nil filter)"}, + {"name": "first loan is added to the history", "verdict": "PASS", "why": "domain outcome (loan added to history)"} + ] + }, + "F4_http_leak": { + "category": "http-code-leak", + "names": [ + {"name": "POST /providers returns 201", "verdict": "FLAG", "why": "HTTP method + status code"}, + {"name": "returns 409 on duplicate name", "verdict": "FLAG", "why": "HTTP status code"}, + {"name": "GET /providers/:id returns 200 with body", "verdict": "FLAG", "why": "HTTP method + status code"}, + {"name": "admin deletes the provider", "verdict": "PASS", "why": "actor + verb + domain object"} + ] + }, + "F5_good_names_control": { + "category": "good-names-control", + "names": [ + {"name": "creditor grants a credit line to borrower", "verdict": "PASS", "why": "actor + verb + outcome"}, + {"name": "borrower draws on the credit line", "verdict": "PASS", "why": "actor + verb + domain object"}, + {"name": "borrower repays part of the drawn amount", "verdict": "PASS", "why": "actor + verb + outcome"}, + {"name": "draw is rejected when limit is exceeded", "verdict": "PASS", "why": "domain outcome with domain condition"}, + {"name": "counterparty revokes the credit line", "verdict": "PASS", "why": "actor + verb + domain object"} + ] + } + } +} diff --git a/eval/naming-length/variants/naming.md b/eval/naming-length/variants/naming.md new file mode 100644 index 0000000..90f9ca6 --- /dev/null +++ b/eval/naming-length/variants/naming.md @@ -0,0 +1,70 @@ +# Samurai Test Naming + +The "Naming" section in SKILL.md states the bar and the four forbidden patterns. This file is the replacement protocol and worked examples — load it when you have a bad name in hand and need to propose a good one. + +## Replacement protocol + +Do not look at the diff hunk header, the parent test function name, or the function under test. **Open the `Test()` callback body.** Identify three things from the code in that body: + +1. **Actor** — who acts. Usually a domain noun: `borrower`, `admin`, `provider`, `API caller`, `counterparty`. If there is no clear actor, the system itself (`the service`, `the API`) is acceptable but rarely the best choice. +2. **Verb** — the action call that mutates state or makes a request: `db.CreateUser`, `client.UpdateQuote`, `svc.Settle`. Express it as a domain verb (`creates`, `updates`, `settles`), not the Go method name. +3. **Outcome** — what the assertions in the same callback verify: a state change, a rejection, a returned value's domain meaning. + +Write a 3–7 word, lowercase, present-tense phrase combining (1)+(2)+(3) — or (1)+(2) when the outcome is implied by success. Propose 1–2 alternatives so the author can pick. + +If you cannot identify the actor or the outcome from the body, the `Test()` is doing too much or too little — flag this before renaming. Either the action is missing (a setup-only test) or many actions are bundled (split it). + +## Detection heuristics + +| Pattern | Why it's bad | What to look at in the body instead | +|---|---|---| +| CamelCase token matching a Go identifier in the Test body | Couples the test name to the SUT symbol; rename of the function breaks the name | The effect of that call in domain terms | +| Bare HTTP status code (`200`, `404`, `409`) | Leaks the HTTP layer; the same behaviour at gRPC would be unnamable | The outcome verb: `accepts`, `rejects duplicate id`, `reports missing` | +| `is / equals / returns / not nil / has length / contains` describing a value | Describes the assertion, not the step | The action that produced the value; the assertion is implementation | +| `empty / nil / with N keys / map / slice / struct` | Leaks the input data structure | The domain meaning of the input: `no past loans`, `missing preferences`, `unset filter` | + +## Bad → good (grounded in real samurai trees) + +- `"UpdateQuote calls produce audit records with matching client quote ids"` — identifier leak (`UpdateQuote`) + → `"updating a quote writes an audit record"` + → `"the audit record links to the originating client quote"` + +- `"err is not nil"` — assertion phrasing + → `"rejects request with invalid signature"` + → `"refuses unauthenticated callers"` + +- `"returns 409"` — HTTP leak + → `"rejects duplicate id"` + → `"refuses to create a provider that already exists"` + +- `"with empty slice"` — structure leak + → `"returns no matches when the filter is unset"` + → `"borrower has no past loans"` + +- `"CreateProvider returns nil"` — identifier leak + assertion phrasing + → `"creates a new provider successfully"` + → `"a fresh provider id is issued"` + +## Uniformly across tree levels + +The bar is the same at top, intermediate, and leaf. Example: + +``` +"borrower opens a credit line" // top — business setup, not "CreateCreditLine returns ok" + ├─ "borrower draws on the credit line" // intermediate — action, not "Draw=true" + │ ├─ "balance reflects the drawdown" // leaf — outcome, not "balance equals 100" + │ └─ "draw is rejected when limit is exceeded" + └─ "counterparty accepts KYB access" +``` + +No level mentions the Go type (`CreditLine`), the method (`Draw`), the HTTP code, the assertion verb (`equals`), or the data shape (`with balance > 0`). + +## During review + +When reviewing a samurai test, scan every `s.Test(...)` string literal against the heuristics above. For each hit: + +1. Quote the offending name and identify which forbidden pattern it matches. +2. Read the `Test()` body and derive actor + verb + outcome. +3. Propose 1–2 concrete replacements. + +Do not silently rewrite — surface the issue and the proposal so the author confirms domain accuracy. diff --git a/eval/naming-length/variants/no-load/v1.md b/eval/naming-length/variants/no-load/v1.md new file mode 100644 index 0000000..2854734 --- /dev/null +++ b/eval/naming-length/variants/no-load/v1.md @@ -0,0 +1,12 @@ +## Naming + +Every `s.Test("name", ...)` — top, intermediate, leaf — names a business action or outcome from the actor's point of view. Never the function under test, the assertion, or the input data shape. The same bar applies at every level of the tree. + +Forbidden patterns (flag and rewrite when authoring or reviewing): + +- **Identifier leak** — any CamelCase token that appears as a Go identifier in the Test body (function, type, method). `"UpdateQuote calls produce audit records"` leaks `UpdateQuote`. +- **HTTP / status leak** — bare 3-digit codes matching `\b[1-5]\d{2}\b`, or status-range shorthand like `4xx / 5xx`. `"returns 409"` → `"rejects duplicate id"`. +- **Assertion phrasing** — verbs like `is / equals / returns / not nil / has length / contains / has` describing the value, not the step. `"err is not nil"` → `"rejects invalid signature"`. `"the provider has the requested name"` → `"provider is created with the requested name"`. +- **Structure phrasing** — input-shape words like `empty / nil / with N keys / map / slice / array / struct`, or assertion shapes like `returns an empty list`. `"with empty slice"` → `"returns no matches when filter is unset"`. + +When a name matches any of the above, read the `Test()` body and propose 1–2 concrete domain replacements derived from the body. Names that use risky-looking words in a domain-natural sense (`"borrower returns the equipment"`, `"borrower has insufficient collateral"`, `"new borrower has no past loans"`) are clean — only flag when the word describes the assertion, the data shape, or the identifier. diff --git a/eval/naming-length/variants/no-load/v2.md b/eval/naming-length/variants/no-load/v2.md new file mode 100644 index 0000000..7acc9b0 --- /dev/null +++ b/eval/naming-length/variants/no-load/v2.md @@ -0,0 +1,12 @@ +## Naming + +Every `s.Test("name", ...)` — at every tree level — names a business action or outcome from the actor's POV. Not the function under test, the assertion, or the input shape. + +Forbidden patterns: + +- **Identifier leak** — CamelCase tokens matching Go identifiers in the Test body. `"UpdateQuote calls..."` leaks `UpdateQuote`. +- **HTTP code leak** — bare 3-digit codes (`\b[1-5]\d{2}\b`) or `4xx/5xx` shorthand. `"returns 409"` → `"rejects duplicate id"`. +- **Assertion phrasing** — `is / equals / returns / not nil / has length / contains / has`. `"err is not nil"` → `"rejects invalid signature"`. +- **Structure phrasing** — `empty / nil / with N / map / slice / struct`. `"with empty slice"` → `"borrower has no past loans"`. + +Risky-looking words used in a domain-natural sense (`borrower returns the equipment`, `has insufficient collateral`, `no past loans`) are clean — only flag when the word describes the assertion, the data shape, or the identifier. Flag matches, read the body, propose 1–2 replacements. diff --git a/eval/naming-length/variants/no-load/v3.md b/eval/naming-length/variants/no-load/v3.md new file mode 100644 index 0000000..c41154a --- /dev/null +++ b/eval/naming-length/variants/no-load/v3.md @@ -0,0 +1,5 @@ +## Naming + +Every `s.Test("name", ...)` — at every tree level — names a business action or outcome. Forbidden: identifier leak (`UpdateQuote calls...`), HTTP codes incl. `4xx/5xx` (`returns 409`), assertion phrasing (`err is not nil`, `has the requested name`), structure phrasing (`with empty slice`, `returns empty list`). + +Flag matches; read the `Test()` body; propose 1–2 domain replacements. Risky words in domain-natural sense (`returns the equipment`, `no past loans`) are clean. diff --git a/eval/naming-length/variants/no-load/v4.md b/eval/naming-length/variants/no-load/v4.md new file mode 100644 index 0000000..941eb10 --- /dev/null +++ b/eval/naming-length/variants/no-load/v4.md @@ -0,0 +1,3 @@ +## Naming + +`s.Test(...)` names are business actions/outcomes at every level — not function names, HTTP codes, assertion verbs (`is nil`, `equals`, `has X`), or data-shape words (`empty slice`, `nil filter`). Propose domain replacements. diff --git a/eval/naming-length/variants/no-load/v5.md b/eval/naming-length/variants/no-load/v5.md new file mode 100644 index 0000000..1c86ef0 --- /dev/null +++ b/eval/naming-length/variants/no-load/v5.md @@ -0,0 +1,3 @@ +## Naming + +`s.Test(...)` names are business actions, never function names, status codes, or assertions. diff --git a/eval/naming-length/variants/v1.md b/eval/naming-length/variants/v1.md new file mode 100644 index 0000000..d74aa26 --- /dev/null +++ b/eval/naming-length/variants/v1.md @@ -0,0 +1,12 @@ +## Naming + +Every `s.Test("name", ...)` — top, intermediate, leaf — names a business action or outcome from the actor's point of view. Never the function under test, the assertion, or the input data shape. The same bar applies at every level of the tree. + +Forbidden patterns (flag and rewrite when authoring or reviewing): + +- **Identifier leak** — any CamelCase token that appears as a Go identifier in the Test body (function, type, method). `"UpdateQuote calls produce audit records"` leaks `UpdateQuote`. +- **HTTP / status leak** — bare 3-digit codes matching `\b[1-5]\d{2}\b`. `"returns 409"` → `"rejects duplicate id"`. +- **Assertion phrasing** — verbs like `is / equals / returns / not nil / has length / contains` describing the value, not the step. `"err is not nil"` → `"rejects invalid signature"`. +- **Structure phrasing** — input-shape words like `empty / nil / with N keys / map / slice / array / struct`. `"with empty slice"` → `"returns no matches when filter is unset"`. + +When a name (yours or one you are reviewing) matches any of the above, read the `Test()` body and rewrite. Do not silently change names — propose 1–2 concrete replacements derived from the body. See [naming.md](naming.md) for the replacement protocol and worked examples. diff --git a/eval/naming-length/variants/v2.md b/eval/naming-length/variants/v2.md new file mode 100644 index 0000000..efe693c --- /dev/null +++ b/eval/naming-length/variants/v2.md @@ -0,0 +1,12 @@ +## Naming + +Every `s.Test("name", ...)` — at every tree level — names a business action or outcome from the actor's POV. Not the function under test, the assertion, or the input shape. + +Forbidden patterns: + +- **Identifier leak** — CamelCase tokens matching Go identifiers in the Test body. `"UpdateQuote calls..."` leaks `UpdateQuote`. +- **HTTP code leak** — bare 3-digit codes (`\b[1-5]\d{2}\b`). `"returns 409"` → `"rejects duplicate id"`. +- **Assertion phrasing** — `is / equals / returns / not nil / has length / contains`. `"err is not nil"` → `"rejects invalid signature"`. +- **Structure phrasing** — `empty / nil / with N / map / slice / struct`. `"with empty slice"` → `"borrower has no past loans"`. + +Flag matches, read the body, propose 1–2 replacements. See [naming.md](naming.md) for the protocol. diff --git a/eval/naming-length/variants/v3.md b/eval/naming-length/variants/v3.md new file mode 100644 index 0000000..18d193d --- /dev/null +++ b/eval/naming-length/variants/v3.md @@ -0,0 +1,5 @@ +## Naming + +Every `s.Test("name", ...)` — at every tree level — names a business action or outcome. Forbidden: identifier leak (`UpdateQuote calls...`), HTTP codes (`returns 409`), assertion phrasing (`err is not nil`), structure phrasing (`with empty slice`). + +Flag matches; read the `Test()` body; propose 1–2 domain replacements. See [naming.md](naming.md). diff --git a/eval/naming-length/variants/v4.md b/eval/naming-length/variants/v4.md new file mode 100644 index 0000000..d327c0d --- /dev/null +++ b/eval/naming-length/variants/v4.md @@ -0,0 +1,3 @@ +## Naming + +`s.Test(...)` names are business actions/outcomes at every level — not function names, HTTP codes, assertion verbs (`is nil`, `equals`), or data-shape words (`empty slice`). Propose domain replacements. See [naming.md](naming.md). diff --git a/eval/naming-length/variants/v5.md b/eval/naming-length/variants/v5.md new file mode 100644 index 0000000..6736eca --- /dev/null +++ b/eval/naming-length/variants/v5.md @@ -0,0 +1,3 @@ +## Naming + +`s.Test(...)` names are business actions, never function names, status codes, or assertions. See [naming.md](naming.md). From 07431742e8a4d4778e7469f66198d1770a2b7c59 Mon Sep 17 00:00:00 2001 From: stepan_romankov Date: Mon, 25 May 2026 16:22:32 +0200 Subject: [PATCH 4/4] fix(skill): resolve naming-guidance inconsistencies caught in skill-creator review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop bare `has` from the inline assertion-phrasing list in SKILL.md: it conflicted with the safe-examples carve-out for `has insufficient collateral` and was absent from naming.md's heuristics row, so a model cross-checking the two files saw conflicting signals. - Pair the HTTP-status examples in naming.md with their domain verbs (`200 → accepts`, `404 → reports missing`, `409 → rejects duplicate id`) so the list no longer reads as if all HTTP leaks are rejections. Coverage on the hardened eval is unchanged (both edits are inside cases V2 already PASSed). --- cmd/claude-setup/skill/SKILL.md | 2 +- cmd/claude-setup/skill/naming.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/claude-setup/skill/SKILL.md b/cmd/claude-setup/skill/SKILL.md index c06e06c..0def146 100644 --- a/cmd/claude-setup/skill/SKILL.md +++ b/cmd/claude-setup/skill/SKILL.md @@ -28,7 +28,7 @@ Forbidden patterns: - **Identifier leak** — CamelCase tokens matching Go identifiers in the Test body. `"UpdateQuote calls..."` leaks `UpdateQuote`. - **HTTP code leak** — bare 3-digit codes (`\b[1-5]\d{2}\b`) or `4xx/5xx` shorthand. `"returns 409"` → `"rejects duplicate id"`. -- **Assertion phrasing** — `is / equals / returns / not nil / has length / contains / has`. `"err is not nil"` → `"rejects invalid signature"`. +- **Assertion phrasing** — `is / equals / returns / not nil / has length / contains`. `"err is not nil"` → `"rejects invalid signature"`. - **Structure phrasing** — `empty / nil / with N / map / slice / struct`. `"with empty slice"` → `"borrower has no past loans"`. Risky-looking words used in a domain-natural sense (`borrower returns the equipment`, `has insufficient collateral`, `no past loans`) are clean — only flag when the word describes the assertion, the data shape, or the identifier. Flag matches, read the body, propose 1–2 replacements. See [naming.md](naming.md) for the protocol. diff --git a/cmd/claude-setup/skill/naming.md b/cmd/claude-setup/skill/naming.md index 90f9ca6..46d2a17 100644 --- a/cmd/claude-setup/skill/naming.md +++ b/cmd/claude-setup/skill/naming.md @@ -19,7 +19,7 @@ If you cannot identify the actor or the outcome from the body, the `Test()` is d | Pattern | Why it's bad | What to look at in the body instead | |---|---|---| | CamelCase token matching a Go identifier in the Test body | Couples the test name to the SUT symbol; rename of the function breaks the name | The effect of that call in domain terms | -| Bare HTTP status code (`200`, `404`, `409`) | Leaks the HTTP layer; the same behaviour at gRPC would be unnamable | The outcome verb: `accepts`, `rejects duplicate id`, `reports missing` | +| Bare HTTP status code (`200 → accepts`, `404 → reports missing`, `409 → rejects duplicate id`) | Leaks the HTTP layer; the same behaviour at gRPC would be unnamable | The outcome verb the code stands for, not the code itself | | `is / equals / returns / not nil / has length / contains` describing a value | Describes the assertion, not the step | The action that produced the value; the assertion is implementation | | `empty / nil / with N keys / map / slice / struct` | Leaks the input data structure | The domain meaning of the input: `no past loans`, `missing preferences`, `unset filter` |