From e6ad0f3b40a115f523d79a7e558cd5d736de4180 Mon Sep 17 00:00:00 2001
From: dadachi <maurois@mac.com>
Date: Sat, 2 May 2026 21:45:55 +0900
Subject: [PATCH] Layer 3 Phase 5b: integrate runVisualJudge into runJudge
 (opt-in)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends JudgeInput with an optional `visual` field. When provided,
runJudge calls runVisualJudge per configured platform (#40), captures
results into the JudgeResult, and rolls them into the summary line.

Default behavior unchanged: dispatch.ts doesn't pass `visual`, so
runJudge takes the existing skip path with a clearer trace
("visual config not provided; skipped") instead of the old
"not yet wired" placeholder.

When visual IS configured:
  Layer 1 3/3 pass · Layer 2 3/3 pass · Layer 3 2/2 pass

Per-platform reports surface in JudgeResult.visual.{ios,android}:
  - pass: boolean (overall median-of-3 verdict)
  - screenshotPath: where the captured PNG lives
  - scores: per-criterion verdict + rationale
  - error: populated when launch / capture / judge failed before
    Layer 3 produced scores

Default rubric is DEFAULT_STAGE1_RUBRIC (3 criteria, from #40).
Default screenshot dir is tmp/screenshots/<slug>/ (resolved against
process.cwd()).
Default spec text is domain.displayName; callers can override.

JudgeResult shape grows to:
  { overallPass, summary, visual?: { ios?, android? } }
overallPass requires Layer 1, Layer 2, and (when visual is configured)
every judged platform's visual verdict to pass. Strict-optional types:
`visual` is present only when Layer 3 ran, and each per-platform entry
only when that platform was judged.

Out of scope:
  - Wiring `visual` from dispatch.ts (caller-side; driver script /
    plugin entry point would populate this).
  - Resolving artifactPath / bundleId / packageName from the slug
    (done by the driver via Info.plist / AndroidManifest.xml read).
  - Bridge from Layer 2 build mode → installAndLaunch.

Tests: `npm run ci` green (12/12). Existing dispatch e2e test still
passes unchanged (visual not passed → skip path). New shape verified
via TypeScript strict-optional compilation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/agents/judge.ts | 105 +++++++++++++++++++++++++++++++++++++++++---
 src/agents/types.ts |  13 ++++++
 2 files changed, 113 insertions(+), 5 deletions(-)
diff --git a/src/agents/judge.ts b/src/agents/judge.ts
index 18443d6..262bd65 100644
--- a/src/agents/judge.ts
+++ b/src/agents/judge.ts
@@ -1,9 +1,22 @@
-import { resolve } from "node:path";
+import { resolve, join } from "node:path";
 import { trace } from "../trace.js";
 import { isStub } from "../stub.js";
 import { runLayer1 } from "../validation/layer1.js";
 import { runLayer2 } from "../validation/layer2.js";
-import type { DomainSpec, JudgeResult, Platform, ReviewerResult, WorkerResult } from "./types.js";
+import {
+  runVisualJudge,
+  DEFAULT_STAGE1_RUBRIC,
+  type VisualJudgeResult,
+} from "../validation/visual-judge.js";
+import type { Layer3Criterion } from "../validation/layer3.js";
+import type {
+  DomainSpec,
+  JudgeResult,
+  Platform,
+  ReviewerResult,
+  VisualJudgePlatformReport,
+  WorkerResult,
+} from "./types.js";
 
 export type JudgeInput = {
   domain: DomainSpec;
@@ -11,6 +24,15 @@ export type JudgeInput = {
   ios: WorkerResult;
   android: WorkerResult;
   reviewer: ReviewerResult;
+  visual?: VisualJudgeConfig;
+};
+
+export type VisualJudgeConfig = { // opt-in Layer 3 settings; runJudge runs the visual phase only when ios or android is set
+  ios?: { artifactPath: string; bundleId: string }; // forwarded to runVisualJudge with platform "ios"
+  android?: { artifactPath: string; packageName: string }; // forwarded to runVisualJudge with platform "android"
+  screenshotDir?: string; // defaults to <cwd>/tmp/screenshots/<domain.slug>
+  rubric?: readonly Layer3Criterion[]; // defaults to DEFAULT_STAGE1_RUBRIC
+  spec?: string; // defaults to domain.displayName
 };
 
 type PlatformReport = {
@@ -40,18 +62,91 @@ export async function runJudge(input: JudgeInput): Promise<JudgeResult> {
     trace("judge", `${r.platform}: Layer 1 ${l1} · Layer 2 ${l2} [${r.layer2Command}]`);
   }
 
-  trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — not yet wired; treating as skipped");
+  let visualReport: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } | undefined;
+  let layer3Summary = "Layer 3 skipped";
+  if (input.visual && (input.visual.ios || input.visual.android)) {
+    visualReport = await runVisualPhase(input.visual, input.domain);
+    layer3Summary = formatLayer3Summary(visualReport);
+  } else {
+    trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — visual config not provided; skipped");
+  }
 
-  const overallPass = reports.every((r) => r.layer1Pass && r.layer2Pass);
+  const layer1Layer2Pass = reports.every((r) => r.layer1Pass && r.layer2Pass);
+  const visualPass = visualReport
+    ? Object.values(visualReport).every((r): r is VisualJudgePlatformReport => Boolean(r) && r!.pass)
+    : true;
+  const overallPass = layer1Layer2Pass && visualPass;
   const l1Total = reports.filter((r) => r.layer1Pass).length;
   const l2Total = reports.filter((r) => r.layer2Pass).length;
 
   return {
     overallPass,
-    summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · Layer 3 skipped`,
+    summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · ${layer3Summary}`,
+    ...(visualReport ? { visual: visualReport } : {}),
   };
 }
 
+/**
+ * Runs the Layer 3 visual judge once for each platform present in `config`.
+ *
+ * Platforms are judged concurrently (iOS is started first when both are set).
+ * Defaults: screenshots go to `<cwd>/tmp/screenshots/<domain.slug>/`, the rubric
+ * is DEFAULT_STAGE1_RUBRIC and the spec text is `domain.displayName`.
+ * @param config - Per-platform artifact details plus optional overrides.
+ * @param domain - Supplies the slug and display name used for the defaults.
+ * @returns Reports keyed by platform; only judged platforms have an entry.
+ */
+async function runVisualPhase(
+  config: VisualJudgeConfig,
+  domain: DomainSpec,
+): Promise<{ ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }> {
+  const screenshotDir = config.screenshotDir ?? resolve(process.cwd(), "tmp", "screenshots", domain.slug);
+  const rubric = config.rubric ?? DEFAULT_STAGE1_RUBRIC;
+  const spec = config.spec ?? domain.displayName;
+  const shotPath = (platform: "ios" | "android"): string => join(screenshotDir, `${platform}-home.png`);
+
+  // Narrow each platform's config once and build its judge input here, so no
+  // non-null assertions or structural casts are needed further down.
+  const inputs: Array<Parameters<typeof runVisualJudge>[0]> = [];
+  if (config.ios) {
+    const { artifactPath, bundleId } = config.ios;
+    inputs.push({ platform: "ios", artifactPath, bundleId, screenshotPath: shotPath("ios"), spec, rubric });
+  }
+  if (config.android) {
+    const { artifactPath, packageName } = config.android;
+    inputs.push({ platform: "android", artifactPath, packageName, screenshotPath: shotPath("android"), spec, rubric });
+  }
+
+  const names = inputs.map((input) => input.platform).join(" + ");
+  trace("judge", `Layer 3 (semantic) — judging ${names} home screen against ${rubric.length}-criterion rubric`);
+
+  const results = await Promise.all(inputs.map(async (input) => {
+    const visualResult = await runVisualJudge(input);
+    trace(
+      "judge",
+      `Layer 3 ${input.platform}: ${visualResult.ok ? "PASS" : "FAIL"}` +
+        (visualResult.error ? ` — ${visualResult.error}` : ""),
+    );
+    return [input.platform, toPlatformReport(visualResult)] as const;
+  }));
+
+  return Object.fromEntries(results);
+}
+
+/** Maps a raw judge result onto the report shape, setting optional fields only when present. */
+function toPlatformReport(result: VisualJudgeResult): VisualJudgePlatformReport {
+  const { ok, screenshotPath, layer3, error } = result;
+  const shot = screenshotPath !== undefined ? { screenshotPath } : {};
+  const scored = layer3?.scores ? { scores: layer3.scores } : {};
+  return { pass: ok, ...shot, ...scored, ...(error !== undefined ? { error } : {}) };
+}
+
+/** Builds the Layer 3 summary fragment: passing platforms out of judged platforms. */
+function formatLayer3Summary(report: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }): string {
+  const judged = [report.ios, report.android].filter((entry) => entry !== undefined);
+  return `Layer 3 ${judged.filter((entry) => entry?.pass).length}/${judged.length} pass`;
+}
+
 const delay = (ms: number): Promise<void> => new Promise((r) => { setTimeout(r, ms); });
 
 async function runStubJudge(): Promise<JudgeResult> {
diff --git a/src/agents/types.ts b/src/agents/types.ts
index 1043f74..a2361ac 100644
--- a/src/agents/types.ts
+++ b/src/agents/types.ts
@@ -43,4 +43,17 @@ export type ReviewerResult = {
 export type JudgeResult = {
   overallPass: boolean;
   summary: string;
+  visual?: VisualJudgeReport; // per-platform Layer 3 reports; set by runJudge only when the visual phase ran
+};
+
+export type VisualJudgeReport = { // Layer 3 visual-judge results keyed by platform
+  ios?: VisualJudgePlatformReport; // present only when iOS was judged
+  android?: VisualJudgePlatformReport; // present only when Android was judged
+};
+
+export type VisualJudgePlatformReport = { // outcome of the visual judge for a single platform
+  pass: boolean; // copied from the visual judge result's `ok` flag
+  screenshotPath?: string; // set when the judge result carries a screenshot path
+  scores?: readonly { criterionId: string; pass: boolean; rationale: string }[]; // per-criterion results; set when Layer 3 produced scores
+  error?: string; // set when the judge result carries an error message
+};