nativeapptemplate · dadachi · May 3, 2026 · May 3, 2026
diff --git a/docs/SPEC.md b/docs/SPEC.md
@@ -126,7 +126,7 @@ Even Stage 1 is enough to surface silent field renames: a field renamed on the s
 - Parse the original NL spec into a structured "expected domain" (entities, fields, relationships, key verbs).
 - Read the generated Rails code and ask Opus 4.7 to score, on a structured rubric (Yes/No per criterion with justification), whether the implementation expresses the expected domain.
 - Repeat separately for iOS copy / labels and Android copy / labels, derived from `Localizable.strings` and `strings.xml`.
-- For UI specifically, Opus 4.7 vision reads the simulator and emulator screenshots captured in Layer 2 / section-5 Stage 1 and scores: "does this screen read as a [clinic queue / restaurant waitlist / task tracker] to a real user?"
+- For UI specifically, Opus 4.7 vision reads the simulator and emulator screenshots captured in Layer 2. **Stage 1** scores the post-launch screen for substrate-leak detection and render sanity (catches "the app still says 'Shop' everywhere"). **Stage 2** navigates past onboarding/login via mobile-mcp to the actual domain UI and scores: "does this screen read as a [clinic queue / restaurant waitlist / task tracker] to a real user?"
 
 Guardrails against known LLM-as-judge weaknesses:
 - Rubrics are structured Yes/No-per-criterion, never free-form scoring, to reduce the "well, I guess that's fine" drift LLM judges default to.

diff --git a/src/validation/visual-judge.ts b/src/validation/visual-judge.ts
@@ -95,15 +95,20 @@ export async function runVisualJudge(input: VisualJudgeInput): Promise<VisualJud
   };
 }
 
-// Default Stage 1 rubric for home-screen judging — three Yes/No criteria
-// covering domain match, substrate-leak detection, and basic render sanity.
-// Phrased so pass=true is the desired state on every criterion.
+// Default Stage 1 rubric — two Yes/No criteria covering substrate-leak
+// detection and basic render sanity. Phrased so pass=true is the desired
+// state on every criterion.
+//
+// Per docs/SPEC.md, Stage 1 captures the post-launch screen and is scoped
+// to "catch egregious rename failures" — substrate-leak detection plus
+// 'does anything render at all'. Domain-semantic matching (e.g. "does this
+// read as a clinic queue?") requires reaching the actual domain UI past
+// onboarding/login and lives in Stage 2 (mobile-mcp navigation), where
+// the agent can drive the app to a list/detail/form screen before
+// judging. A domain-match criterion here would produce false failures
+// for any substrate that shows onboarding before its domain UI, which
+// is the substrate's intentional design.
 export const DEFAULT_STAGE1_RUBRIC: readonly Layer3Criterion[] = [
-  {
-    id: "domain-match",
-    question:
-      "Does this screen unambiguously read as the SaaS product described in the spec, to a typical user seeing it for the first time?",
-  },
   {
     id: "no-substrate-leak",
     question:

diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts
@@ -47,7 +47,7 @@ test("discoverIosArtifact returns null for missing dir", async () => {
 
 test("DEFAULT_STAGE1_RUBRIC has the expected criteria ids", () => {
   const ids = DEFAULT_STAGE1_RUBRIC.map((c) => c.id);
-  assert.deepEqual(ids, ["domain-match", "no-substrate-leak", "renders-cleanly"]);
+  assert.deepEqual(ids, ["no-substrate-leak", "renders-cleanly"]);
 });
 
 test("runVisualJudge short-circuits on launch failure (no sim booted)", async () => {