diff --git a/python/cube/automation/judge_panel.py b/python/cube/automation/judge_panel.py index 2f4de988..129143ff 100644 --- a/python/cube/automation/judge_panel.py +++ b/python/cube/automation/judge_panel.py @@ -208,6 +208,13 @@ def _maybe_start_dev_server(judges: list, config, task_id: str): # configured AND (b) node_modules is missing in the chosen worktree. _ensure_node_modules_for_dev_server(worktree, config) + # Ensure Playwright's Chromium is present. @playwright/mcp shells out to + # a real browser; if it was never installed (`browser_navigate` would + # fail with "Executable doesn't exist"), install it now. Cheap no-op once + # the cache exists. Mirrors the node_modules fallback above — cube auto's + # environment can't be assumed by cube prv. + _ensure_playwright_browsers(judges) + return start_dev_server( worktree=worktree, config=config.verify.dev_server, @@ -215,6 +222,61 @@ def _maybe_start_dev_server(judges: list, config, task_id: str): ) +def _ensure_playwright_browsers(judges: list) -> None: + """Install Playwright's Chromium when a browser-judge needs it. + + Gated on at least one judge declaring an ``@playwright/mcp`` server. + Skips the install when the ms-playwright cache already has a chromium + build (the common case after first run / in a baked container image). + + Non-fatal: on failure the MCP server's ``browser_navigate`` will error + at call time and the judge falls back to static review — same graceful + degradation as a dead dev server. We don't crash the panel over a + browser download. + """ + wants_playwright = any( + isinstance(cfg, dict) and any(isinstance(a, str) and "@playwright/mcp" in a for a in (cfg.get("args") or [])) + for j in judges + for cfg in (j.mcp_servers or {}).values() + ) + if not wants_playwright: + return + + # ms-playwright cache lives under a platform-specific dir; check the two + # common locations rather than importing playwright (it's a node dep, not + # a python one). + cache_dirs = [ + Path.home() / "Library" / "Caches" / "ms-playwright", # macOS + Path.home() / ".cache" / "ms-playwright", # Linux + ] + has_chromium = any(d.exists() and any(p.name.startswith("chromium") for p in d.iterdir()) for d in cache_dirs) + if has_chromium: + return + + import subprocess as _subprocess + + print_info("playwright: Chromium not installed — running `npx playwright install chromium` (one-time)") + try: + result = _subprocess.run( + "npx --yes playwright install chromium chromium-headless-shell", + shell=True, + capture_output=True, + text=True, + timeout=300, + ) + if result.returncode != 0: + tail = (result.stderr or result.stdout or "").splitlines()[-5:] + print_warning( + "playwright: Chromium install failed (exit %d) — browser tools " + "will fail and the judge will review statically. Last lines:\n %s" + % (result.returncode, "\n ".join(tail)) + ) + else: + print_info("playwright: Chromium install complete") + except (OSError, _subprocess.TimeoutExpired) as e: + print_warning(f"playwright: Chromium install raised {type(e).__name__}: {e}") + + def _ensure_node_modules_for_dev_server(worktree: Path, config) -> None: """Lazy-install dependencies into ``worktree`` when missing. @@ -290,6 +352,51 @@ def _populate_browser_context(judges: list, handle, task_id: str) -> None: j_dir = screenshots_root / j.key j_dir.mkdir(parents=True, exist_ok=True) j.screenshots_dir = j_dir + j.mcp_servers = _augment_playwright_args(j.mcp_servers, j_dir) + + +def _augment_playwright_args(mcp_servers: dict, screenshots_dir: Path) -> dict: + """Inject runtime flags into any ``@playwright/mcp`` stdio server. + + Three flags matter for a headless judge subprocess: + + * ``--headless`` — @playwright/mcp launches *headed* by default. A headed + browser has nowhere to draw in a judge subprocess (and certainly not in + the cube-runner container), so it hangs/fails and the MCP server never + finishes connecting. This is the single biggest cause of the "pending" + server we saw in the wild. + * ``--output-dir `` — ``browser_take_screenshot`` writes + into the server's output dir, not an arbitrary path. Point it at this + judge's screenshot dir so the files land where the summary block looks + for them. + * ``--isolated`` — in-memory profile, nothing persisted to disk between + runs. Judges are ephemeral; we don't want a profile accreting state. + + Idempotent: only touches servers whose args reference ``@playwright/mcp``, + and only appends a flag that isn't already present (operator overrides + win). Returns a new dict; does not mutate the input. + """ + if not mcp_servers: + return mcp_servers + + augmented: dict = {} + for name, cfg in mcp_servers.items(): + if not isinstance(cfg, dict): + augmented[name] = cfg + continue + args = list(cfg.get("args", []) or []) + is_playwright = any(isinstance(a, str) and "@playwright/mcp" in a for a in args) + if not is_playwright: + augmented[name] = cfg + continue + if "--headless" not in args: + args.append("--headless") + if "--isolated" not in args: + args.append("--isolated") + if "--output-dir" not in args: + args += ["--output-dir", str(screenshots_dir)] + augmented[name] = {**cfg, "args": args} + return augmented def _apply_judge_persona(prompt: str, judge_info: JudgeInfo) -> str: @@ -354,28 +461,34 @@ def _apply_browser_addendum(prompt: str, judge_info: JudgeInfo) -> str: screenshots_dir = judge_info.screenshots_dir addendum = f"""# BROWSER REVIEW (Playwright MCP available) -You have Playwright MCP tools available for this review. Use them to -visually verify any UI change in the diff: - - - ``playwright_navigate`` — open a route at the running dev server - - ``playwright_screenshot`` — capture the current viewport (full-page - or element-scoped) - - ``playwright_click`` / ``playwright_fill`` / ``playwright_select_option`` +You have Playwright MCP tools available for this review. They are exposed +under the ``mcp__playwright__`` namespace — call them by their real names +(the SDK may also surface short aliases, but these are the canonical ones): + + - ``mcp__playwright__browser_navigate`` — open a route at the running dev server + - ``mcp__playwright__browser_snapshot`` — accessibility snapshot of the page + (prefer this for structure/text/ARIA; it's cheaper than a screenshot) + - ``mcp__playwright__browser_take_screenshot`` — capture the viewport + (``fullPage`` for the whole page, or pass an element ref). Screenshots + are written to this judge's output directory automatically. + - ``mcp__playwright__browser_click`` / ``browser_type`` / ``browser_select_option`` — drive interactions to reach the state you want to verify - - ``playwright_evaluate`` — run JS to inspect computed styles, ARIA - labels, layout + - ``mcp__playwright__browser_evaluate`` — run JS to inspect computed styles, + ARIA labels, layout Dev server is running at: **{judge_info.dev_server_url}** For each changed route in the diff: - 1. Navigate to it. + 1. Navigate to it with ``mcp__playwright__browser_navigate``. 2. Screenshot the relevant states (default, hover/focus where applicable, dark mode if the project has it, empty / error / - loading states if the diff touches them). - 3. Save each screenshot under: ``{screenshots_dir}/.png`` - (e.g. ``checkout-empty-cart-dark.png``). - 4. Reference every screenshot you took in your decision file's - ``screenshots`` field: + loading states if the diff touches them) with + ``mcp__playwright__browser_take_screenshot``. Pass a descriptive + ``filename`` (e.g. ``checkout-empty-cart-dark.png``) — the server + writes it into THIS judge's output directory automatically, which + is: ``{screenshots_dir}`` + 3. Reference every screenshot you took in your decision file's + ``screenshots`` field, using the full path under that directory: ```json "screenshots": [ @@ -390,10 +503,16 @@ def _apply_browser_addendum(prompt: str, judge_info: JudgeInfo) -> str: intent, or you spot a regression text review couldn't catch, surface it. -If you decide the change is non-visual (pure refactor, types-only, -test-only, server-side), you can skip the browser pass — but say so -explicitly in your review summary so the operator knows you considered -it. +This PR has frontend changes and a dev server is running, so the +browser pass is NOT optional: before you finalise your decision you +MUST navigate to at least one affected route and capture at least one +screenshot, even when you expect the change to look fine — a clean +screenshot is itself evidence the render path works. The only +exception is a change with genuinely no rendered surface (types-only, +test-only, pure server-side helper with no route touched); in that +case state explicitly in your summary which file you judged non-visual +and why, so the operator knows the browser pass was considered and +deliberately skipped, not forgotten. --- diff --git a/sdk-bridge/dist/cli.js b/sdk-bridge/dist/cli.js index ab3e32ba..51963ce9 100644 --- a/sdk-bridge/dist/cli.js +++ b/sdk-bridge/dist/cli.js @@ -67,6 +67,22 @@ function emitLine(obj) { process.stdout.write(JSON.stringify(obj) + ` `); } +function shapeMcpServers(servers) { + const out = {}; + for (const [name, raw] of Object.entries(servers)) { + if (raw && typeof raw === "object" && "command" in raw) { + const cfg = raw; + out[name] = { + type: "stdio", + alwaysLoad: true, + ...cfg + }; + } else { + out[name] = raw; + } + } + return out; +} async function run(opts) { const { model, effort } = resolveModel(opts.model); const env = { ...process.env }; @@ -81,7 +97,7 @@ async function run(opts) { allowDangerouslySkipPermissions: true, includePartialMessages: true, env, - ...opts.mcpServers && Object.keys(opts.mcpServers).length > 0 ? { mcpServers: opts.mcpServers } : {} + ...opts.mcpServers && Object.keys(opts.mcpServers).length > 0 ? { mcpServers: shapeMcpServers(opts.mcpServers) } : {} } }); try { diff --git a/sdk-bridge/src/providers/claude.ts b/sdk-bridge/src/providers/claude.ts index 01473e3e..12c70ab1 100644 --- a/sdk-bridge/src/providers/claude.ts +++ b/sdk-bridge/src/providers/claude.ts @@ -77,6 +77,48 @@ function emitLine(obj: unknown): void { process.stdout.write(JSON.stringify(obj) + "\n"); } +/** + * Shape the mcpServers map for the Agent SDK. + * + * Two SDK quirks bite browser-judges otherwise (observed: Playwright stayed + * "pending" forever, zero tools in the judge's turn-1 prompt, zero browser + * calls): + * + * 1. MCP startup is NON-BLOCKING by default. The turn-1 prompt is built + * before a stdio server (e.g. `npx @playwright/mcp`) finishes its + * connect handshake, so its tools never make it into the prompt — the + * judge can't call tools it was never told about. + * 2. Tools are deferred behind tool-search by default, so even once the + * server connects the judge has to go looking for them. + * + * Setting `alwaysLoad: true` fixes both: it forces the tools into the turn-1 + * prompt AND blocks startup until the server connects (capped at the SDK's + * 5s connect timeout). We also pin `type: "stdio"` for any entry carrying a + * `command`, so the SDK never mis-routes a command-based server. + * + * Untouched: servers we don't recognise as stdio (http/sse maps pass through + * verbatim), and the operator's config-file-discovered servers (those don't + * come through this param at all). + */ +function shapeMcpServers( + servers: Record, +): Record { + const out: Record = {}; + for (const [name, raw] of Object.entries(servers)) { + if (raw && typeof raw === "object" && "command" in (raw as object)) { + const cfg = raw as Record; + out[name] = { + type: "stdio", + alwaysLoad: true, + ...cfg, + }; + } else { + out[name] = raw; + } + } + return out; +} + export async function run(opts: RunOptions): Promise { const { model, effort } = resolveModel(opts.model); @@ -101,7 +143,7 @@ export async function run(opts: RunOptions): Promise { // discovery. Non-empty → judges get the declared tools (e.g. // Playwright for judge_4's browser-based UI review). ...(opts.mcpServers && Object.keys(opts.mcpServers).length > 0 - ? { mcpServers: opts.mcpServers as any } + ? { mcpServers: shapeMcpServers(opts.mcpServers) as any } : {}), }, }); diff --git a/tests/automation/test_judge_panel_browser_gating.py b/tests/automation/test_judge_panel_browser_gating.py index 86d63eae..2caec00e 100644 --- a/tests/automation/test_judge_panel_browser_gating.py +++ b/tests/automation/test_judge_panel_browser_gating.py @@ -8,7 +8,9 @@ from __future__ import annotations -from cube.automation.judge_panel import _has_frontend_changes +from pathlib import Path + +from cube.automation.judge_panel import _augment_playwright_args, _has_frontend_changes from cube.models.types import JudgeInfo @@ -75,3 +77,48 @@ def test_mixed_browser_and_non_browser_judges(): _judge(mcp=True, scoped_files=["src/Button.tsx"]), ] assert _has_frontend_changes(judges) is True + + +# --------------------------------------------------------------------------- +# _augment_playwright_args: runtime flags injected into the @playwright/mcp +# stdio server so a headless judge can actually drive a browser and its +# screenshots land in the right place. +# --------------------------------------------------------------------------- + + +def test_augment_injects_headless_isolated_and_output_dir(): + servers = {"playwright": {"command": "npx", "args": ["-y", "@playwright/mcp@latest"]}} + out = _augment_playwright_args(servers, Path("/tmp/shots/judge_4")) + args = out["playwright"]["args"] + assert "--headless" in args + assert "--isolated" in args + assert args[args.index("--output-dir") + 1] == "/tmp/shots/judge_4" + # Original list not mutated. + assert servers["playwright"]["args"] == ["-y", "@playwright/mcp@latest"] + + +def test_augment_is_idempotent_and_respects_operator_flags(): + servers = { + "playwright": { + "command": "npx", + "args": ["-y", "@playwright/mcp@latest", "--headless", "--output-dir", "/custom"], + } + } + out = _augment_playwright_args(servers, Path("/tmp/shots/judge_4")) + args = out["playwright"]["args"] + # No duplicate --headless, operator's --output-dir wins. + assert args.count("--headless") == 1 + assert args.count("--output-dir") == 1 + assert args[args.index("--output-dir") + 1] == "/custom" + # --isolated still gets added (operator didn't set it). + assert "--isolated" in args + + +def test_augment_ignores_non_playwright_servers(): + servers = {"other": {"command": "node", "args": ["server.js"]}} + out = _augment_playwright_args(servers, Path("/tmp/x")) + assert out["other"]["args"] == ["server.js"] + + +def test_augment_handles_empty(): + assert _augment_playwright_args({}, Path("/tmp/x")) == {}