Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 100 additions & 5 deletions src/agents/judge.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,38 @@
import { resolve } from "node:path";
import { resolve, join } from "node:path";
import { trace } from "../trace.js";
import { isStub } from "../stub.js";
import { runLayer1 } from "../validation/layer1.js";
import { runLayer2 } from "../validation/layer2.js";
import type { DomainSpec, JudgeResult, Platform, ReviewerResult, WorkerResult } from "./types.js";
import {
runVisualJudge,
DEFAULT_STAGE1_RUBRIC,
type VisualJudgeResult,
} from "../validation/visual-judge.js";
import type { Layer3Criterion } from "../validation/layer3.js";
import type {
DomainSpec,
JudgeResult,
Platform,
ReviewerResult,
VisualJudgePlatformReport,
WorkerResult,
} from "./types.js";

/**
 * Input bundle accepted by `runJudge`.
 *
 * Carries the domain being judged, one `WorkerResult` each under `rails`,
 * `ios` and `android`, the reviewer's result, and an optional visual-judge
 * configuration.
 */
export type JudgeInput = {
domain: DomainSpec;
rails: WorkerResult;
ios: WorkerResult;
android: WorkerResult;
reviewer: ReviewerResult;
// Optional Layer 3 configuration. `runJudge` only runs the visual phase when
// this is set AND at least one of its `ios` / `android` entries is present;
// otherwise Layer 3 is reported as skipped.
visual?: VisualJudgeConfig;
};

/**
 * Configuration for the Layer 3 visual judge phase.
 *
 * Each platform entry that is present is judged; an absent entry means that
 * platform is left out of the visual report entirely.
 */
export type VisualJudgeConfig = {
// iOS target: `artifactPath` and `bundleId` are forwarded to `runVisualJudge`.
ios?: { artifactPath: string; bundleId: string };
// Android target: `artifactPath` and `packageName` are forwarded to `runVisualJudge`.
android?: { artifactPath: string; packageName: string };
// Directory that receives `<platform>-home.png`.
// Defaults to `<cwd>/tmp/screenshots/<domain.slug>`.
screenshotDir?: string;
// Criteria to judge against. Defaults to `DEFAULT_STAGE1_RUBRIC`.
rubric?: readonly Layer3Criterion[];
// Spec text handed to the judge. Defaults to `domain.displayName`.
spec?: string;
};

type PlatformReport = {
Expand Down Expand Up @@ -40,18 +62,91 @@ export async function runJudge(input: JudgeInput): Promise<JudgeResult> {
trace("judge", `${r.platform}: Layer 1 ${l1} · Layer 2 ${l2} [${r.layer2Command}]`);
}

trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — not yet wired; treating as skipped");
let visualReport: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } | undefined;
let layer3Summary = "Layer 3 skipped";
if (input.visual && (input.visual.ios || input.visual.android)) {
visualReport = await runVisualPhase(input.visual, input.domain);
layer3Summary = formatLayer3Summary(visualReport);
} else {
trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — visual config not provided; skipped");
}

const overallPass = reports.every((r) => r.layer1Pass && r.layer2Pass);
const layer1Layer2Pass = reports.every((r) => r.layer1Pass && r.layer2Pass);
const visualPass = visualReport
? Object.values(visualReport).every((r): r is VisualJudgePlatformReport => Boolean(r) && r!.pass)
: true;
const overallPass = layer1Layer2Pass && visualPass;
const l1Total = reports.filter((r) => r.layer1Pass).length;
const l2Total = reports.filter((r) => r.layer2Pass).length;

return {
overallPass,
summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · Layer 3 skipped`,
summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · ${layer3Summary}`,
...(visualReport ? { visual: visualReport } : {}),
};
}

/**
 * Runs the Layer 3 visual judge for every platform configured in `config`.
 *
 * Platforms are judged concurrently. Each one gets its own screenshot path
 * (`<screenshotDir>/<platform>-home.png`) and shares the same `spec` and
 * `rubric`. One trace line is emitted before judging starts and one per
 * platform once its verdict is known.
 *
 * @param config - Visual judge settings; only platforms whose entry is present are judged.
 * @param domain - Supplies the fallback screenshot directory (`slug`) and fallback spec (`displayName`).
 * @returns A report keyed by platform, containing only the platforms that were judged.
 */
async function runVisualPhase(
  config: VisualJudgeConfig,
  domain: DomainSpec,
): Promise<{ ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }> {
  const screenshotDir = config.screenshotDir ?? resolve(process.cwd(), "tmp", "screenshots", domain.slug);
  const rubric = config.rubric ?? DEFAULT_STAGE1_RUBRIC;
  const spec = config.spec ?? domain.displayName;

  // Build each request inside the branch where its platform config is already
  // narrowed, so no non-null assertion or structural cast is needed.
  type VisualJudgeRequest = Parameters<typeof runVisualJudge>[0];
  const requests: VisualJudgeRequest[] = [];
  const { ios, android } = config;
  if (ios) {
    requests.push({
      platform: "ios",
      artifactPath: ios.artifactPath,
      bundleId: ios.bundleId,
      screenshotPath: join(screenshotDir, "ios-home.png"),
      spec,
      rubric,
    });
  }
  if (android) {
    requests.push({
      platform: "android",
      artifactPath: android.artifactPath,
      packageName: android.packageName,
      screenshotPath: join(screenshotDir, "android-home.png"),
      spec,
      rubric,
    });
  }

  const judged = requests.map((request) => request.platform).join(" + ");
  trace("judge", `Layer 3 (semantic) — judging ${judged} home screen against ${rubric.length}-criterion rubric`);

  const entries = await Promise.all(
    requests.map(async (request) => {
      const outcome = await runVisualJudge(request);
      trace(
        "judge",
        `Layer 3 ${request.platform}: ${outcome.ok ? "PASS" : "FAIL"}` +
          (outcome.error ? ` — ${outcome.error}` : ""),
      );
      return [request.platform, toPlatformReport(outcome)] as const;
    }),
  );

  return Object.fromEntries(entries);
}

/**
 * Converts a raw visual-judge result into the per-platform report shape.
 *
 * `pass` mirrors `result.ok`. The optional fields are copied only when the
 * source actually carries them, so the report never holds a key whose value
 * is `undefined`.
 */
function toPlatformReport(result: VisualJudgeResult): VisualJudgePlatformReport {
  const { ok, screenshotPath, layer3, error } = result;
  const scores = layer3?.scores;
  return {
    pass: ok,
    ...(screenshotPath === undefined ? {} : { screenshotPath }),
    ...(scores ? { scores } : {}),
    ...(error === undefined ? {} : { error }),
  };
}

/**
 * Renders the Layer 3 fragment of the judge summary line.
 *
 * Counts the platforms that have a report and how many of those passed,
 * producing `Layer 3 <passed>/<judged> pass`.
 */
function formatLayer3Summary(report: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }): string {
  let judged = 0;
  let passed = 0;
  for (const platform of ["ios", "android"] as const) {
    const entry = report[platform];
    if (entry === undefined) continue;
    judged += 1;
    if (entry?.pass) passed += 1;
  }
  return `Layer 3 ${passed}/${judged} pass`;
}

/** Returns a promise that resolves with no value once a `setTimeout` of `ms` milliseconds fires. */
const delay = (ms: number): Promise<void> => {
  const sleeper = new Promise<void>((wake) => {
    setTimeout(() => wake(), ms);
  });
  return sleeper;
};

async function runStubJudge(): Promise<JudgeResult> {
Expand Down
13 changes: 13 additions & 0 deletions src/agents/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,17 @@ export type ReviewerResult = {
/**
 * Verdict returned by the judge.
 *
 * The field notes below describe how `runJudge` populates it; other producers
 * of this type may differ.
 */
export type JudgeResult = {
// True only when every platform report passes Layer 1 and Layer 2 and, if the
// visual phase ran, every visual platform report passes as well.
overallPass: boolean;
// One-line human-readable tally, e.g.
// "Layer 1 3/3 pass · Layer 2 3/3 pass · Layer 3 2/2 pass".
summary: string;
// Per-platform Layer 3 results; omitted when the visual phase did not run.
visual?: VisualJudgeReport;
};

/**
 * Layer 3 visual-judge results keyed by mobile platform.
 *
 * A key is expected to be present only for a platform that was actually
 * judged — verify against the producer.
 */
export type VisualJudgeReport = {
ios?: VisualJudgePlatformReport;
android?: VisualJudgePlatformReport;
};

/** Layer 3 visual-judge outcome for a single platform. */
export type VisualJudgePlatformReport = {
// Overall verdict for this platform.
pass: boolean;
// Path of the screenshot that was judged, when one is available.
screenshotPath?: string;
// Per-criterion verdicts with the judge's rationale, when scoring was produced.
scores?: readonly { criterionId: string; pass: boolean; rationale: string }[];
// Error message reported by the visual judge, if any.
error?: string;
};
Loading