diff --git a/README.md b/README.md index 8927ed6..2525211 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,11 @@ The agent will also be available as a Claude Code plugin. - For runtime validation (Layer 2 onwards): Xcode 26.3+ with iOS 26.2+ simulator, Android SDK with API 26+ emulator - For UI automation: [`mobile-next/mobile-mcp`](https://github.com/mobile-next/mobile-mcp) (installed automatically as a Claude Code MCP server) +### Optional flags + +- `NATIVEAPPTEMPLATE_VISUAL=1` — opts the run into Stage 1 visual judging (Layer 3). The value must be exactly `1`; any other value leaves visual judging off. When enabled, Layer 2 runs in **build mode** instead of fast mode (full `xcodebuild build` + `./gradlew assembleDebug`); then, for each platform, the agent installs the app on the booted simulator/emulator, captures the home screen, and judges it with Opus 4.7 vision against `DEFAULT_STAGE1_RUBRIC`. Adds 60-180s per platform depending on cold-build time. Requires a booted simulator/emulator for each platform you want judged. Off by default — `npm run dev` keeps the existing fast path. +- `NATIVEAPPTEMPLATE_AGENT_ANTHROPIC_KEY` — dedicated workspace key, see [Security](#security). + ## Validation (three layers) The agent doesn't just generate code and exit — it validates the output. 
diff --git a/src/agents/judge.ts b/src/agents/judge.ts index 262bd65..d1cd50f 100644 --- a/src/agents/judge.ts +++ b/src/agents/judge.ts @@ -1,14 +1,11 @@ -import { resolve, join } from "node:path"; +import { resolve } from "node:path"; import { trace } from "../trace.js"; import { isStub } from "../stub.js"; import { runLayer1 } from "../validation/layer1.js"; -import { runLayer2 } from "../validation/layer2.js"; -import { - runVisualJudge, - DEFAULT_STAGE1_RUBRIC, - type VisualJudgeResult, -} from "../validation/visual-judge.js"; +import { runLayer2, type Layer2Mode } from "../validation/layer2.js"; +import { runStage1Visual } from "../validation/stage1.js"; import type { Layer3Criterion } from "../validation/layer3.js"; +import type { VisualJudgeResult } from "../validation/visual-judge.js"; import type { DomainSpec, JudgeResult, @@ -24,12 +21,16 @@ export type JudgeInput = { ios: WorkerResult; android: WorkerResult; reviewer: ReviewerResult; + layer2Mode?: Layer2Mode; visual?: VisualJudgeConfig; }; +// outDir-based: runJudge calls runStage1Visual which discovers the build +// artifact + identifier from each provided platform dir post-Layer-2-build. +// Caller must enable layer2Mode: "build" for the discovery to find anything. export type VisualJudgeConfig = { - ios?: { artifactPath: string; bundleId: string }; - android?: { artifactPath: string; packageName: string }; + iosDir?: string; + androidDir?: string; screenshotDir?: string; rubric?: readonly Layer3Criterion[]; spec?: string; @@ -47,13 +48,14 @@ type PlatformReport = { export async function runJudge(input: JudgeInput): Promise { if (isStub("judge")) return runStubJudge(); + const layer2Mode: Layer2Mode = input.layer2Mode ?? 
"fast"; trace("judge", "Layer 1 (structural) — scanning for leftover tokens"); - trace("judge", "Layer 2 (runtime) — validating toolchains load"); + trace("judge", `Layer 2 (runtime, ${layer2Mode} mode) — validating toolchains load`); const reports = await Promise.all([ - evaluate(input.rails), - evaluate(input.ios), - evaluate(input.android), + evaluate(input.rails, layer2Mode), + evaluate(input.ios, layer2Mode), + evaluate(input.android, layer2Mode), ]); for (const r of reports) { @@ -64,7 +66,7 @@ export async function runJudge(input: JudgeInput): Promise { let visualReport: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } | undefined; let layer3Summary = "Layer 3 skipped"; - if (input.visual && (input.visual.ios || input.visual.android)) { + if (input.visual && (input.visual.iosDir || input.visual.androidDir)) { visualReport = await runVisualPhase(input.visual, input.domain); layer3Summary = formatLayer3Summary(visualReport); } else { @@ -90,47 +92,31 @@ async function runVisualPhase( config: VisualJudgeConfig, domain: DomainSpec, ): Promise<{ ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }> { - const screenshotDir = config.screenshotDir ?? resolve(process.cwd(), "tmp", "screenshots", domain.slug); - const rubric = config.rubric ?? DEFAULT_STAGE1_RUBRIC; - const spec = config.spec ?? domain.displayName; - const platforms: Array<"ios" | "android"> = []; - if (config.ios) platforms.push("ios"); - if (config.android) platforms.push("android"); - - trace("judge", `Layer 3 (semantic) — judging ${platforms.join(" + ")} home screen against ${rubric.length}-criterion rubric`); - - const results = await Promise.all(platforms.map(async (platform) => { - const cfg = platform === "ios" ? config.ios! : config.android!; - const screenshotPath = join(screenshotDir, `${platform}-home.png`); - const visualResult = await runVisualJudge( - platform === "ios" - ? 
{ - platform: "ios", - artifactPath: cfg.artifactPath, - bundleId: (cfg as { bundleId: string }).bundleId, - screenshotPath, - spec, - rubric, - } - : { - platform: "android", - artifactPath: cfg.artifactPath, - packageName: (cfg as { packageName: string }).packageName, - screenshotPath, - spec, - rubric, - }, - ); - trace( - "judge", - `Layer 3 ${platform}: ${visualResult.ok ? "PASS" : "FAIL"}` + - (visualResult.error ? ` — ${visualResult.error}` : ""), - ); - return [platform, toPlatformReport(visualResult)] as const; - })); - - return Object.fromEntries(results); + if (config.iosDir) platforms.push("ios"); + if (config.androidDir) platforms.push("android"); + + const rubric = config.rubric; + trace("judge", `Layer 3 (semantic) — judging ${platforms.join(" + ")} home screen against rubric`); + + const stage1 = await runStage1Visual({ + ...(config.iosDir !== undefined ? { iosDir: config.iosDir } : {}), + ...(config.androidDir !== undefined ? { androidDir: config.androidDir } : {}), + spec: config.spec ?? domain.displayName, + ...(rubric !== undefined ? { rubric } : {}), + ...(config.screenshotDir !== undefined ? { screenshotDir: config.screenshotDir } : {}), + }); + + const report: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } = {}; + if (stage1.ios) { + report.ios = toPlatformReport(stage1.ios); + trace("judge", `Layer 3 ios: ${stage1.ios.ok ? "PASS" : "FAIL"}` + (stage1.ios.error ? ` — ${stage1.ios.error}` : "")); + } + if (stage1.android) { + report.android = toPlatformReport(stage1.android); + trace("judge", `Layer 3 android: ${stage1.android.ok ? "PASS" : "FAIL"}` + (stage1.android.error ? 
` — ${stage1.android.error}` : "")); + } + return report; } function toPlatformReport(result: VisualJudgeResult): VisualJudgePlatformReport { @@ -165,12 +151,12 @@ async function runStubJudge(): Promise { return { overallPass: true, summary: "Layer 1/2/3 PASS" }; } -async function evaluate(worker: WorkerResult): Promise { +async function evaluate(worker: WorkerResult, layer2Mode: Layer2Mode): Promise { const outDir = resolve(process.cwd(), worker.outDir); const [layer1, layer2] = await Promise.all([ runLayer1({ projectDir: outDir, forbiddenTokens: worker.renamedFrom }), - runLayer2({ platform: worker.platform, outDir }), + runLayer2({ platform: worker.platform, outDir, mode: layer2Mode }), ]); return { diff --git a/src/dispatch.ts b/src/dispatch.ts index f2a1cbf..330ec2c 100644 --- a/src/dispatch.ts +++ b/src/dispatch.ts @@ -1,9 +1,10 @@ +import { resolve } from "node:path"; import { runPlanner } from "./agents/planner.js"; import { runRailsWorker } from "./agents/workers/rails.js"; import { runIosWorker } from "./agents/workers/ios.js"; import { runAndroidWorker } from "./agents/workers/android.js"; import { runReviewer } from "./agents/reviewer.js"; -import { runJudge } from "./agents/judge.js"; +import { runJudge, type VisualJudgeConfig } from "./agents/judge.js"; import type { JudgeResult } from "./agents/types.js"; export async function dispatch(spec: string): Promise { @@ -14,5 +15,27 @@ export async function dispatch(spec: string): Promise { runAndroidWorker(domain), ]); const reviewer = await runReviewer({ domain, rails, ios, android }); - return runJudge({ domain, rails, ios, android, reviewer }); + + // Stage 1 visual judging is opt-in via NATIVEAPPTEMPLATE_VISUAL=1. + // It forces Layer 2 build mode (so .app / .apk artifacts exist for the + // discovery + install + launch + capture chain) and adds 60-180s to a + // run depending on the substrate's cold-build time. Off by default. 
+ const visualEnabled = process.env['NATIVEAPPTEMPLATE_VISUAL'] === "1"; + const visual: VisualJudgeConfig | undefined = visualEnabled + ? { + iosDir: resolve(process.cwd(), ios.outDir), + androidDir: resolve(process.cwd(), android.outDir), + spec: domain.displayName, + } + : undefined; + + return runJudge({ + domain, + rails, + ios, + android, + reviewer, + ...(visualEnabled ? { layer2Mode: "build" as const } : {}), + ...(visual ? { visual } : {}), + }); }