Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ The agent will also be available as a Claude Code plugin.
- For runtime validation (Layer 2 onwards): Xcode 26.3+ with iOS 26.2+ simulator, Android SDK with API 26+ emulator
- For UI automation: [`mobile-next/mobile-mcp`](https://github.com/mobile-next/mobile-mcp) (installed automatically as a Claude Code MCP server)

### Optional flags

- `NATIVEAPPTEMPLATE_VISUAL=1` — opts the run into Stage 1 visual judging (Layer 3). When set, Layer 2 runs in **build mode** instead of fast mode (full `xcodebuild build` + `./gradlew assembleDebug`). Then, for each platform, the agent installs the app on the booted simulator/emulator, captures the home screen, and judges it with Opus 4.7 vision against `DEFAULT_STAGE1_RUBRIC`. Adds 60-180s per platform, depending on cold-build time. Requires a booted simulator/emulator for each platform you want judged. Off by default — `npm run dev` keeps the existing fast path.
- `NATIVEAPPTEMPLATE_AGENT_ANTHROPIC_KEY` — dedicated workspace key; see [Security](#security).

## Validation (three layers)

The agent doesn't just generate code and exit — it validates the output.
Expand Down
98 changes: 42 additions & 56 deletions src/agents/judge.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import { resolve, join } from "node:path";
import { resolve } from "node:path";
import { trace } from "../trace.js";
import { isStub } from "../stub.js";
import { runLayer1 } from "../validation/layer1.js";
import { runLayer2 } from "../validation/layer2.js";
import {
runVisualJudge,
DEFAULT_STAGE1_RUBRIC,
type VisualJudgeResult,
} from "../validation/visual-judge.js";
import { runLayer2, type Layer2Mode } from "../validation/layer2.js";
import { runStage1Visual } from "../validation/stage1.js";
import type { Layer3Criterion } from "../validation/layer3.js";
import type { VisualJudgeResult } from "../validation/visual-judge.js";
import type {
DomainSpec,
JudgeResult,
Expand All @@ -24,12 +21,16 @@ export type JudgeInput = {
ios: WorkerResult;
android: WorkerResult;
reviewer: ReviewerResult;
layer2Mode?: Layer2Mode;
visual?: VisualJudgeConfig;
};

// Directory-based config: runJudge calls runStage1Visual, which discovers the
// build artifact and app identifier from each provided platform directory
// after the Layer 2 build has run. The caller must also set
// layer2Mode: "build"; otherwise the discovery will not find anything.
export type VisualJudgeConfig = {
ios?: { artifactPath: string; bundleId: string };
android?: { artifactPath: string; packageName: string };
iosDir?: string;
androidDir?: string;
screenshotDir?: string;
rubric?: readonly Layer3Criterion[];
spec?: string;
Expand All @@ -47,13 +48,14 @@ type PlatformReport = {
export async function runJudge(input: JudgeInput): Promise<JudgeResult> {
if (isStub("judge")) return runStubJudge();

const layer2Mode: Layer2Mode = input.layer2Mode ?? "fast";
trace("judge", "Layer 1 (structural) — scanning for leftover tokens");
trace("judge", "Layer 2 (runtime) — validating toolchains load");
trace("judge", `Layer 2 (runtime, ${layer2Mode} mode) — validating toolchains load`);

const reports = await Promise.all([
evaluate(input.rails),
evaluate(input.ios),
evaluate(input.android),
evaluate(input.rails, layer2Mode),
evaluate(input.ios, layer2Mode),
evaluate(input.android, layer2Mode),
]);

for (const r of reports) {
Expand All @@ -64,7 +66,7 @@ export async function runJudge(input: JudgeInput): Promise<JudgeResult> {

let visualReport: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } | undefined;
let layer3Summary = "Layer 3 skipped";
if (input.visual && (input.visual.ios || input.visual.android)) {
if (input.visual && (input.visual.iosDir || input.visual.androidDir)) {
visualReport = await runVisualPhase(input.visual, input.domain);
layer3Summary = formatLayer3Summary(visualReport);
} else {
Expand All @@ -90,47 +92,31 @@ async function runVisualPhase(
config: VisualJudgeConfig,
domain: DomainSpec,
): Promise<{ ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport }> {
const screenshotDir = config.screenshotDir ?? resolve(process.cwd(), "tmp", "screenshots", domain.slug);
const rubric = config.rubric ?? DEFAULT_STAGE1_RUBRIC;
const spec = config.spec ?? domain.displayName;

const platforms: Array<"ios" | "android"> = [];
if (config.ios) platforms.push("ios");
if (config.android) platforms.push("android");

trace("judge", `Layer 3 (semantic) — judging ${platforms.join(" + ")} home screen against ${rubric.length}-criterion rubric`);

const results = await Promise.all(platforms.map(async (platform) => {
const cfg = platform === "ios" ? config.ios! : config.android!;
const screenshotPath = join(screenshotDir, `${platform}-home.png`);
const visualResult = await runVisualJudge(
platform === "ios"
? {
platform: "ios",
artifactPath: cfg.artifactPath,
bundleId: (cfg as { bundleId: string }).bundleId,
screenshotPath,
spec,
rubric,
}
: {
platform: "android",
artifactPath: cfg.artifactPath,
packageName: (cfg as { packageName: string }).packageName,
screenshotPath,
spec,
rubric,
},
);
trace(
"judge",
`Layer 3 ${platform}: ${visualResult.ok ? "PASS" : "FAIL"}` +
(visualResult.error ? ` — ${visualResult.error}` : ""),
);
return [platform, toPlatformReport(visualResult)] as const;
}));

return Object.fromEntries(results);
if (config.iosDir) platforms.push("ios");
if (config.androidDir) platforms.push("android");

const rubric = config.rubric;
trace("judge", `Layer 3 (semantic) — judging ${platforms.join(" + ")} home screen against rubric`);

const stage1 = await runStage1Visual({
...(config.iosDir !== undefined ? { iosDir: config.iosDir } : {}),
...(config.androidDir !== undefined ? { androidDir: config.androidDir } : {}),
spec: config.spec ?? domain.displayName,
...(rubric !== undefined ? { rubric } : {}),
...(config.screenshotDir !== undefined ? { screenshotDir: config.screenshotDir } : {}),
});

const report: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } = {};
if (stage1.ios) {
report.ios = toPlatformReport(stage1.ios);
trace("judge", `Layer 3 ios: ${stage1.ios.ok ? "PASS" : "FAIL"}` + (stage1.ios.error ? ` — ${stage1.ios.error}` : ""));
}
if (stage1.android) {
report.android = toPlatformReport(stage1.android);
trace("judge", `Layer 3 android: ${stage1.android.ok ? "PASS" : "FAIL"}` + (stage1.android.error ? ` — ${stage1.android.error}` : ""));
}
return report;
}

function toPlatformReport(result: VisualJudgeResult): VisualJudgePlatformReport {
Expand Down Expand Up @@ -165,12 +151,12 @@ async function runStubJudge(): Promise<JudgeResult> {
return { overallPass: true, summary: "Layer 1/2/3 PASS" };
}

async function evaluate(worker: WorkerResult): Promise<PlatformReport> {
async function evaluate(worker: WorkerResult, layer2Mode: Layer2Mode): Promise<PlatformReport> {
const outDir = resolve(process.cwd(), worker.outDir);

const [layer1, layer2] = await Promise.all([
runLayer1({ projectDir: outDir, forbiddenTokens: worker.renamedFrom }),
runLayer2({ platform: worker.platform, outDir }),
runLayer2({ platform: worker.platform, outDir, mode: layer2Mode }),
]);

return {
Expand Down
27 changes: 25 additions & 2 deletions src/dispatch.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import { resolve } from "node:path";
import { runPlanner } from "./agents/planner.js";
import { runRailsWorker } from "./agents/workers/rails.js";
import { runIosWorker } from "./agents/workers/ios.js";
import { runAndroidWorker } from "./agents/workers/android.js";
import { runReviewer } from "./agents/reviewer.js";
import { runJudge } from "./agents/judge.js";
import { runJudge, type VisualJudgeConfig } from "./agents/judge.js";
import type { JudgeResult } from "./agents/types.js";

export async function dispatch(spec: string): Promise<JudgeResult> {
Expand All @@ -14,5 +15,27 @@ export async function dispatch(spec: string): Promise<JudgeResult> {
runAndroidWorker(domain),
]);
const reviewer = await runReviewer({ domain, rails, ios, android });
return runJudge({ domain, rails, ios, android, reviewer });

// Stage 1 visual judging is opt-in via NATIVEAPPTEMPLATE_VISUAL=1.
// It forces Layer 2 build mode (so .app / .apk artifacts exist for the
// discovery + install + launch + capture chain) and adds 60-180s to a
// run depending on the substrate's cold-build time. Off by default.
const visualEnabled = process.env['NATIVEAPPTEMPLATE_VISUAL'] === "1";
const visual: VisualJudgeConfig | undefined = visualEnabled
? {
iosDir: resolve(process.cwd(), ios.outDir),
androidDir: resolve(process.cwd(), android.outDir),
spec: domain.displayName,
}
: undefined;

return runJudge({
domain,
rails,
ios,
android,
reviewer,
...(visualEnabled ? { layer2Mode: "build" as const } : {}),
...(visual ? { visual } : {}),
});
}
Loading