From 9ed9a1411bcd67c56b65e000488693e02ad14b3d Mon Sep 17 00:00:00 2001 From: dadachi Date: Sun, 3 May 2026 14:40:03 +0900 Subject: [PATCH] Stage 1 rubric: drop domain-match, align to docs/SPEC.md scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first real Layer 3 visual smoke run produced an iOS Stage 1 FAIL — not because of a rename bug, but because Stage 1 captured the substrate's onboarding screen ("Onboarding 1 / Landscape / Onboarding description 1") and the rubric's domain-match criterion ("does this read as a clinic queue?") asks a question Stage 1 can't answer until mobile-mcp navigation lands. Per docs/SPEC.md, Stage 1's stated purpose is catching "egregious rename failures" (substrate-leak), and Stage 2 — UI-driven navigation past onboarding/login via mobile-mcp — is where domain-semantic UI judging belongs. The current rubric overreaches into Stage 2 territory at Stage 1. Changes: - DEFAULT_STAGE1_RUBRIC: drop domain-match. Keep no-substrate-leak (Stage 1's actual job per SPEC.md) and renders-cleanly (broad enough to catch crash/layout breakage on any screen including onboarding placeholders). - tests/smoke.test.ts: update the assertion to match new IDs. - docs/SPEC.md: re-pin domain-semantic question to Stage 2, explicitly call Stage 1 "substrate-leak detection and render sanity." Resolves an internal inconsistency in the spec, which pinned the domain-semantic question at Stage 1 even though that question falls outside Stage 1's declared scope. Out of scope: Stage 2 (mobile-mcp navigation past onboarding to the actual domain UI). That's where domain-match belongs, and it is the eventual full demo; it is a significant feature that warrants separate PR(s). Verification: - npm run ci: 19/19 green. - Real-mode visual run on the next attempt should show Layer 3 passing 2/2 on iOS (onboarding placeholder satisfies both no-substrate-leak and renders-cleanly). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/SPEC.md | 2 +- src/validation/visual-judge.ts | 21 +++++++++++++-------- tests/smoke.test.ts | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/docs/SPEC.md b/docs/SPEC.md index 4c7f531..8730e0a 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -126,7 +126,7 @@ Even Stage 1 is enough to surface silent field renames: a field renamed on the s - Parse the original NL spec into a structured "expected domain" (entities, fields, relationships, key verbs). - Read the generated Rails code and ask Opus 4.7 to score, on a structured rubric (Yes/No per criterion with justification), whether the implementation expresses the expected domain. - Repeat separately for iOS copy / labels and Android copy / labels, derived from `Localizable.strings` and `strings.xml`. -- For UI specifically, Opus 4.7 vision reads the simulator and emulator screenshots captured in Layer 2 / section-5 Stage 1 and scores: "does this screen read as a [clinic queue / restaurant waitlist / task tracker] to a real user?" +- For UI specifically, Opus 4.7 vision reads the simulator and emulator screenshots captured in Layer 2. **Stage 1** scores the post-launch screen for substrate-leak detection and render sanity (catches "the app still says 'Shop' everywhere"). **Stage 2** navigates past onboarding/login via mobile-mcp to the actual domain UI and scores: "does this screen read as a [clinic queue / restaurant waitlist / task tracker] to a real user?" Guardrails against known LLM-as-judge weaknesses: - Rubrics are structured Yes/No-per-criterion, never free-form scoring, to reduce the "well, I guess that's fine" drift LLM judges default to. 
diff --git a/src/validation/visual-judge.ts b/src/validation/visual-judge.ts index cb784d8..c152f9a 100644 --- a/src/validation/visual-judge.ts +++ b/src/validation/visual-judge.ts @@ -95,15 +95,20 @@ export async function runVisualJudge(input: VisualJudgeInput): Promise { test("DEFAULT_STAGE1_RUBRIC has the expected criteria ids", () => { const ids = DEFAULT_STAGE1_RUBRIC.map((c) => c.id); - assert.deepEqual(ids, ["domain-match", "no-substrate-leak", "renders-cleanly"]); + assert.deepEqual(ids, ["no-substrate-leak", "renders-cleanly"]); }); test("runVisualJudge short-circuits on launch failure (no sim booted)", async () => {