From e6ad0f3b40a115f523d79a7e558cd5d736de4180 Mon Sep 17 00:00:00 2001 From: dadachi Date: Sat, 2 May 2026 21:45:55 +0900 Subject: [PATCH] Layer 3 Phase 5b: integrate runVisualJudge into runJudge (opt-in) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends JudgeInput with an optional `visual` field. When provided, runJudge calls runVisualJudge per configured platform (#40), captures results into the JudgeResult, and rolls them into the summary line. Default behavior unchanged: dispatch.ts doesn't pass `visual`, so runJudge takes the existing skip path with a clearer trace ("visual config not provided; skipped") instead of the old "not yet wired" placeholder. When visual IS configured: Layer 1 3/3 pass · Layer 2 3/3 pass · Layer 3 2/2 pass Per-platform reports surface in JudgeResult.visual.{ios,android}: - pass: boolean (overall median-of-3 verdict) - screenshotPath: where the captured PNG lives - scores: per-criterion verdict + rationale - error: populated when launch / capture / judge failed before Layer 3 produced scores Default rubric is DEFAULT_STAGE1_RUBRIC (3 criteria, from #40). Default screenshot dir is tmp/screenshots/<slug>/. Default spec text is domain.displayName; callers can override. JudgeResult shape grows to: { overallPass, summary, visual?: { ios?, android? } } overallPass requires layer1+layer2+visual all passing (when visual is configured). Strict-optional types: visual fields only present when the platform was judged. Out of scope: - Wiring `visual` from dispatch.ts (caller-side; driver script / plugin entry point would populate this). - Resolving artifactPath / bundleId / packageName from the slug (done by the driver via Info.plist / AndroidManifest.xml read). - Bridge from Layer 2 build mode → installAndLaunch. Tests: 12/12 npm run ci green. Existing dispatch e2e test still passes unchanged (visual not passed → skip path). New shape verified via TypeScript strict-optional compilation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agents/judge.ts | 105 +++++++++++++++++++++++++++++++++++++++++--- src/agents/types.ts | 13 ++++++ 2 files changed, 113 insertions(+), 5 deletions(-) diff --git a/src/agents/judge.ts b/src/agents/judge.ts index 18443d6..262bd65 100644 --- a/src/agents/judge.ts +++ b/src/agents/judge.ts @@ -1,9 +1,22 @@ -import { resolve } from "node:path"; +import { resolve, join } from "node:path"; import { trace } from "../trace.js"; import { isStub } from "../stub.js"; import { runLayer1 } from "../validation/layer1.js"; import { runLayer2 } from "../validation/layer2.js"; -import type { DomainSpec, JudgeResult, Platform, ReviewerResult, WorkerResult } from "./types.js"; +import { + runVisualJudge, + DEFAULT_STAGE1_RUBRIC, + type VisualJudgeResult, +} from "../validation/visual-judge.js"; +import type { Layer3Criterion } from "../validation/layer3.js"; +import type { + DomainSpec, + JudgeResult, + Platform, + ReviewerResult, + VisualJudgePlatformReport, + WorkerResult, +} from "./types.js"; export type JudgeInput = { domain: DomainSpec; @@ -11,6 +24,15 @@ export type JudgeInput = { ios: WorkerResult; android: WorkerResult; reviewer: ReviewerResult; + visual?: VisualJudgeConfig; +}; + +export type VisualJudgeConfig = { + ios?: { artifactPath: string; bundleId: string }; + android?: { artifactPath: string; packageName: string }; + screenshotDir?: string; + rubric?: readonly Layer3Criterion[]; + spec?: string; }; type PlatformReport = { @@ -40,18 +62,91 @@ export async function runJudge(input: JudgeInput): Promise { trace("judge", `${r.platform}: Layer 1 ${l1} · Layer 2 ${l2} [${r.layer2Command}]`); } - trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — not yet wired; treating as skipped"); + let visualReport: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } | undefined; + let layer3Summary = "Layer 3 skipped"; + if (input.visual && (input.visual.ios || input.visual.android)) { + visualReport = 
await runVisualPhase(input.visual, input.domain); + layer3Summary = formatLayer3Summary(visualReport); + } else { + trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — visual config not provided; skipped"); + } - const overallPass = reports.every((r) => r.layer1Pass && r.layer2Pass); + const layer1Layer2Pass = reports.every((r) => r.layer1Pass && r.layer2Pass); + const visualPass = visualReport + ? Object.values(visualReport).every((r): r is VisualJudgePlatformReport => Boolean(r) && r!.pass) + : true; + const overallPass = layer1Layer2Pass && visualPass; const l1Total = reports.filter((r) => r.layer1Pass).length; const l2Total = reports.filter((r) => r.layer2Pass).length; return { overallPass, - summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · Layer 3 skipped`, + summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · ${layer3Summary}`, + ...(visualReport ? { visual: visualReport } : {}), }; } +async function runVisualPhase( + config: VisualJudgeConfig, + domain: DomainSpec, +): Promise<{ ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }> { + const screenshotDir = config.screenshotDir ?? resolve(process.cwd(), "tmp", "screenshots", domain.slug); + const rubric = config.rubric ?? DEFAULT_STAGE1_RUBRIC; + const spec = config.spec ?? domain.displayName; + + const platforms: Array<"ios" | "android"> = []; + if (config.ios) platforms.push("ios"); + if (config.android) platforms.push("android"); + + trace("judge", `Layer 3 (semantic) — judging ${platforms.join(" + ")} home screen against ${rubric.length}-criterion rubric`); + + const results = await Promise.all(platforms.map(async (platform) => { + const cfg = platform === "ios" ? config.ios! : config.android!; + const screenshotPath = join(screenshotDir, `${platform}-home.png`); + const visualResult = await runVisualJudge( + platform === "ios" + ? 
{ + platform: "ios", + artifactPath: cfg.artifactPath, + bundleId: (cfg as { bundleId: string }).bundleId, + screenshotPath, + spec, + rubric, + } + : { + platform: "android", + artifactPath: cfg.artifactPath, + packageName: (cfg as { packageName: string }).packageName, + screenshotPath, + spec, + rubric, + }, + ); + trace( + "judge", + `Layer 3 ${platform}: ${visualResult.ok ? "PASS" : "FAIL"}` + + (visualResult.error ? ` — ${visualResult.error}` : ""), + ); + return [platform, toPlatformReport(visualResult)] as const; + })); + + return Object.fromEntries(results); +} + +function toPlatformReport(result: VisualJudgeResult): VisualJudgePlatformReport { + const report: VisualJudgePlatformReport = { pass: result.ok }; + if (result.screenshotPath !== undefined) report.screenshotPath = result.screenshotPath; + if (result.layer3?.scores) report.scores = result.layer3.scores; + if (result.error !== undefined) report.error = result.error; + return report; +} + +function formatLayer3Summary(report: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }): string { + const platforms = (["ios", "android"] as const).filter((p) => report[p] !== undefined); + const passing = platforms.filter((p) => report[p]?.pass).length; + return `Layer 3 ${passing}/${platforms.length} pass`; +} + const delay = (ms: number): Promise => new Promise((r) => { setTimeout(r, ms); }); async function runStubJudge(): Promise { diff --git a/src/agents/types.ts b/src/agents/types.ts index 1043f74..a2361ac 100644 --- a/src/agents/types.ts +++ b/src/agents/types.ts @@ -43,4 +43,17 @@ export type ReviewerResult = { export type JudgeResult = { overallPass: boolean; summary: string; + visual?: VisualJudgeReport; +}; + +export type VisualJudgeReport = { + ios?: VisualJudgePlatformReport; + android?: VisualJudgePlatformReport; +}; + +export type VisualJudgePlatformReport = { + pass: boolean; + screenshotPath?: string; + scores?: readonly { criterionId: string; pass: boolean; rationale: string 
}[]; + error?: string; };