diff --git a/src/agents/judge.ts b/src/agents/judge.ts index 18443d6..262bd65 100644 --- a/src/agents/judge.ts +++ b/src/agents/judge.ts @@ -1,9 +1,22 @@ -import { resolve } from "node:path"; +import { resolve, join } from "node:path"; import { trace } from "../trace.js"; import { isStub } from "../stub.js"; import { runLayer1 } from "../validation/layer1.js"; import { runLayer2 } from "../validation/layer2.js"; -import type { DomainSpec, JudgeResult, Platform, ReviewerResult, WorkerResult } from "./types.js"; +import { + runVisualJudge, + DEFAULT_STAGE1_RUBRIC, + type VisualJudgeResult, +} from "../validation/visual-judge.js"; +import type { Layer3Criterion } from "../validation/layer3.js"; +import type { + DomainSpec, + JudgeResult, + Platform, + ReviewerResult, + VisualJudgePlatformReport, + WorkerResult, +} from "./types.js"; export type JudgeInput = { domain: DomainSpec; @@ -11,6 +24,15 @@ export type JudgeInput = { ios: WorkerResult; android: WorkerResult; reviewer: ReviewerResult; + visual?: VisualJudgeConfig; +}; + +export type VisualJudgeConfig = { + ios?: { artifactPath: string; bundleId: string }; + android?: { artifactPath: string; packageName: string }; + screenshotDir?: string; + rubric?: readonly Layer3Criterion[]; + spec?: string; }; type PlatformReport = { @@ -40,18 +62,91 @@ export async function runJudge(input: JudgeInput): Promise { trace("judge", `${r.platform}: Layer 1 ${l1} · Layer 2 ${l2} [${r.layer2Command}]`); } - trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — not yet wired; treating as skipped"); + let visualReport: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } | undefined; + let layer3Summary = "Layer 3 skipped"; + if (input.visual && (input.visual.ios || input.visual.android)) { + visualReport = await runVisualPhase(input.visual, input.domain); + layer3Summary = formatLayer3Summary(visualReport); + } else { + trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — visual config not provided; skipped"); + } - const overallPass = reports.every((r) => r.layer1Pass && r.layer2Pass); + const layer1Layer2Pass = reports.every((r) => r.layer1Pass && r.layer2Pass); + const visualPass = visualReport + ? Object.values(visualReport).every((r): r is VisualJudgePlatformReport => Boolean(r) && r!.pass) + : true; + const overallPass = layer1Layer2Pass && visualPass; const l1Total = reports.filter((r) => r.layer1Pass).length; const l2Total = reports.filter((r) => r.layer2Pass).length; return { overallPass, - summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · Layer 3 skipped`, + summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · ${layer3Summary}`, + ...(visualReport ? { visual: visualReport } : {}), }; } +async function runVisualPhase( + config: VisualJudgeConfig, + domain: DomainSpec, +): Promise<{ ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }> { + const screenshotDir = config.screenshotDir ?? resolve(process.cwd(), "tmp", "screenshots", domain.slug); + const rubric = config.rubric ?? DEFAULT_STAGE1_RUBRIC; + const spec = config.spec ?? domain.displayName; + + const platforms: Array<"ios" | "android"> = []; + if (config.ios) platforms.push("ios"); + if (config.android) platforms.push("android"); + + trace("judge", `Layer 3 (semantic) — judging ${platforms.join(" + ")} home screen against ${rubric.length}-criterion rubric`); + + const results = await Promise.all(platforms.map(async (platform) => { + const cfg = platform === "ios" ? config.ios! : config.android!; + const screenshotPath = join(screenshotDir, `${platform}-home.png`); + const visualResult = await runVisualJudge( + platform === "ios" + ? { + platform: "ios", + artifactPath: cfg.artifactPath, + bundleId: (cfg as { bundleId: string }).bundleId, + screenshotPath, + spec, + rubric, + } + : { + platform: "android", + artifactPath: cfg.artifactPath, + packageName: (cfg as { packageName: string }).packageName, + screenshotPath, + spec, + rubric, + }, + ); + trace( + "judge", + `Layer 3 ${platform}: ${visualResult.ok ? "PASS" : "FAIL"}` + + (visualResult.error ? ` — ${visualResult.error}` : ""), + ); + return [platform, toPlatformReport(visualResult)] as const; + })); + + return Object.fromEntries(results); +} + +function toPlatformReport(result: VisualJudgeResult): VisualJudgePlatformReport { + const report: VisualJudgePlatformReport = { pass: result.ok }; + if (result.screenshotPath !== undefined) report.screenshotPath = result.screenshotPath; + if (result.layer3?.scores) report.scores = result.layer3.scores; + if (result.error !== undefined) report.error = result.error; + return report; +} + +function formatLayer3Summary(report: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }): string { + const platforms = (["ios", "android"] as const).filter((p) => report[p] !== undefined); + const passing = platforms.filter((p) => report[p]?.pass).length; + return `Layer 3 ${passing}/${platforms.length} pass`; +} + const delay = (ms: number): Promise => new Promise((r) => { setTimeout(r, ms); }); async function runStubJudge(): Promise { diff --git a/src/agents/types.ts b/src/agents/types.ts index 1043f74..a2361ac 100644 --- a/src/agents/types.ts +++ b/src/agents/types.ts @@ -43,4 +43,17 @@ export type ReviewerResult = { export type JudgeResult = { overallPass: boolean; summary: string; + visual?: VisualJudgeReport; +}; + +export type VisualJudgeReport = { + ios?: VisualJudgePlatformReport; + android?: VisualJudgePlatformReport; +}; + +export type VisualJudgePlatformReport = { + pass: boolean; + screenshotPath?: string; + scores?: readonly { criterionId: string; pass: boolean; rationale: string }[]; + error?: string; };