aetheronhq · jacsamell · Jun 3, 2026 · Jun 3, 2026
diff --git a/python/cube/automation/judge_panel.py b/python/cube/automation/judge_panel.py
@@ -208,13 +208,75 @@ def _maybe_start_dev_server(judges: list, config, task_id: str):
     # configured AND (b) node_modules is missing in the chosen worktree.
     _ensure_node_modules_for_dev_server(worktree, config)
 
+    # Ensure Playwright's Chromium is present. @playwright/mcp shells out to
+    # a real browser; if it was never installed (`browser_navigate` would
+    # fail with "Executable doesn't exist"), install it now. Cheap no-op once
+    # the cache exists. Mirrors the node_modules fallback above — cube auto's
+    # environment can't be assumed by cube prv.
+    _ensure_playwright_browsers(judges)
+
     return start_dev_server(
         worktree=worktree,
         config=config.verify.dev_server,
         task_id=task_id,
     )
 
 
+def _ensure_playwright_browsers(judges: list) -> None:
+    """Install Playwright's Chromium when a browser-judge needs it.
+
+    Gated on at least one judge declaring an ``@playwright/mcp`` server.
+    Skips the install when the ms-playwright cache already has a chromium
+    build (the common case after first run / in a baked container image).
+
+    Non-fatal: on failure the MCP server's ``browser_navigate`` will error
+    at call time and the judge falls back to static review — same graceful
+    degradation as a dead dev server. We don't crash the panel over a
+    browser download.
+    """
+    wants_playwright = any(
+        isinstance(cfg, dict) and any(isinstance(a, str) and "@playwright/mcp" in a for a in (cfg.get("args") or []))
+        for j in judges
+        for cfg in (j.mcp_servers or {}).values()
+    )
+    if not wants_playwright:
+        return
+
+    # ms-playwright cache lives under a platform-specific dir; check the two
+    # common locations rather than importing playwright (it's a node dep, not
+    # a python one).
+    cache_dirs = [
+        Path.home() / "Library" / "Caches" / "ms-playwright",  # macOS
+        Path.home() / ".cache" / "ms-playwright",  # Linux
+    ]
+    has_chromium = any(d.exists() and any(p.name.startswith("chromium") for p in d.iterdir()) for d in cache_dirs)
+    if has_chromium:
+        return
+
+    import subprocess as _subprocess
+
+    print_info("playwright: Chromium not installed — running `npx playwright install chromium` (one-time)")
+    try:
+        result = _subprocess.run(
+            "npx --yes playwright install chromium chromium-headless-shell",
+            shell=True,
+            capture_output=True,
+            text=True,
+            timeout=300,
+        )
+        if result.returncode != 0:
+            tail = (result.stderr or result.stdout or "").splitlines()[-5:]
+            print_warning(
+                "playwright: Chromium install failed (exit %d) — browser tools "
+                "will fail and the judge will review statically. Last lines:\n  %s"
+                % (result.returncode, "\n  ".join(tail))
+            )
+        else:
+            print_info("playwright: Chromium install complete")
+    except (OSError, _subprocess.TimeoutExpired) as e:
+        print_warning(f"playwright: Chromium install raised {type(e).__name__}: {e}")
+
+
 def _ensure_node_modules_for_dev_server(worktree: Path, config) -> None:
     """Lazy-install dependencies into ``worktree`` when missing.
 
@@ -290,6 +352,51 @@ def _populate_browser_context(judges: list, handle, task_id: str) -> None:
         j_dir = screenshots_root / j.key
         j_dir.mkdir(parents=True, exist_ok=True)
         j.screenshots_dir = j_dir
+        j.mcp_servers = _augment_playwright_args(j.mcp_servers, j_dir)
+
+
+def _augment_playwright_args(mcp_servers: dict, screenshots_dir: Path) -> dict:
+    """Inject runtime flags into any ``@playwright/mcp`` stdio server.
+
+    Three flags matter for a headless judge subprocess:
+
+    * ``--headless`` — @playwright/mcp launches *headed* by default. A headed
+      browser has nowhere to draw in a judge subprocess (and certainly not in
+      the cube-runner container), so it hangs/fails and the MCP server never
+      finishes connecting. This is the single biggest cause of the "pending"
+      server we saw in the wild.
+    * ``--output-dir <screenshots_dir>`` — ``browser_take_screenshot`` writes
+      into the server's output dir, not an arbitrary path. Point it at this
+      judge's screenshot dir so the files land where the summary block looks
+      for them.
+    * ``--isolated`` — in-memory profile, nothing persisted to disk between
+      runs. Judges are ephemeral; we don't want a profile accreting state.
+
+    Idempotent: only touches servers whose args reference ``@playwright/mcp``,
+    and only appends a flag that isn't already present (operator overrides
+    win). Returns a new dict; does not mutate the input.
+    """
+    if not mcp_servers:
+        return mcp_servers
+
+    augmented: dict = {}
+    for name, cfg in mcp_servers.items():
+        if not isinstance(cfg, dict):
+            augmented[name] = cfg
+            continue
+        args = list(cfg.get("args", []) or [])
+        is_playwright = any(isinstance(a, str) and "@playwright/mcp" in a for a in args)
+        if not is_playwright:
+            augmented[name] = cfg
+            continue
+        if "--headless" not in args:
+            args.append("--headless")
+        if "--isolated" not in args:
+            args.append("--isolated")
+        if "--output-dir" not in args:
+            args += ["--output-dir", str(screenshots_dir)]
+        augmented[name] = {**cfg, "args": args}
+    return augmented
 
 
 def _apply_judge_persona(prompt: str, judge_info: JudgeInfo) -> str:
@@ -354,28 +461,34 @@ def _apply_browser_addendum(prompt: str, judge_info: JudgeInfo) -> str:
     screenshots_dir = judge_info.screenshots_dir
     addendum = f"""# BROWSER REVIEW (Playwright MCP available)
 
-You have Playwright MCP tools available for this review. Use them to
-visually verify any UI change in the diff:
-
-  - ``playwright_navigate`` — open a route at the running dev server
-  - ``playwright_screenshot`` — capture the current viewport (full-page
-    or element-scoped)
-  - ``playwright_click`` / ``playwright_fill`` / ``playwright_select_option``
+You have Playwright MCP tools available for this review. They are exposed
+under the ``mcp__playwright__`` namespace — call them by their real names
+(the SDK may also surface short aliases, but these are the canonical ones):
+
+  - ``mcp__playwright__browser_navigate`` — open a route at the running dev server
+  - ``mcp__playwright__browser_snapshot`` — accessibility snapshot of the page
+    (prefer this for structure/text/ARIA; it's cheaper than a screenshot)
+  - ``mcp__playwright__browser_take_screenshot`` — capture the viewport
+    (``fullPage`` for the whole page, or pass an element ref). Screenshots
+    are written to this judge's output directory automatically.
+  - ``mcp__playwright__browser_click`` / ``browser_type`` / ``browser_select_option``
     — drive interactions to reach the state you want to verify
-  - ``playwright_evaluate`` — run JS to inspect computed styles, ARIA
-    labels, layout
+  - ``mcp__playwright__browser_evaluate`` — run JS to inspect computed styles,
+    ARIA labels, layout
 
 Dev server is running at: **{judge_info.dev_server_url}**
 
 For each changed route in the diff:
-  1. Navigate to it.
+  1. Navigate to it with ``mcp__playwright__browser_navigate``.
   2. Screenshot the relevant states (default, hover/focus where
      applicable, dark mode if the project has it, empty / error /
-     loading states if the diff touches them).
-  3. Save each screenshot under: ``{screenshots_dir}/<descriptive-name>.png``
-     (e.g. ``checkout-empty-cart-dark.png``).
-  4. Reference every screenshot you took in your decision file's
-     ``screenshots`` field:
+     loading states if the diff touches them) with
+     ``mcp__playwright__browser_take_screenshot``. Pass a descriptive
+     ``filename`` (e.g. ``checkout-empty-cart-dark.png``) — the server
+     writes it into THIS judge's output directory automatically, which
+     is: ``{screenshots_dir}``
+  3. Reference every screenshot you took in your decision file's
+     ``screenshots`` field, using the full path under that directory:
 
      ```json
      "screenshots": [
@@ -390,10 +503,16 @@ def _apply_browser_addendum(prompt: str, judge_info: JudgeInfo) -> str:
 intent, or you spot a regression text review couldn't catch, surface
 it.
 
-If you decide the change is non-visual (pure refactor, types-only,
-test-only, server-side), you can skip the browser pass — but say so
-explicitly in your review summary so the operator knows you considered
-it.
+This PR has frontend changes and a dev server is running, so the
+browser pass is NOT optional: before you finalise your decision you
+MUST navigate to at least one affected route and capture at least one
+screenshot, even when you expect the change to look fine — a clean
+screenshot is itself evidence the render path works. The only
+exception is a change with genuinely no rendered surface (types-only,
+test-only, pure server-side helper with no route touched); in that
+case state explicitly in your summary which file you judged non-visual
+and why, so the operator knows the browser pass was considered and
+deliberately skipped, not forgotten.
 
 ---
 

diff --git a/sdk-bridge/dist/cli.js b/sdk-bridge/dist/cli.js
@@ -67,6 +67,22 @@ function emitLine(obj) {
   process.stdout.write(JSON.stringify(obj) + `
 `);
 }
+function shapeMcpServers(servers) {
+  const out = {};
+  for (const [name, raw] of Object.entries(servers)) {
+    if (raw && typeof raw === "object" && "command" in raw) {
+      const cfg = raw;
+      out[name] = {
+        type: "stdio",
+        alwaysLoad: true,
+        ...cfg
+      };
+    } else {
+      out[name] = raw;
+    }
+  }
+  return out;
+}
 async function run(opts) {
   const { model, effort } = resolveModel(opts.model);
   const env = { ...process.env };
@@ -81,7 +97,7 @@ async function run(opts) {
       allowDangerouslySkipPermissions: true,
       includePartialMessages: true,
       env,
-      ...opts.mcpServers && Object.keys(opts.mcpServers).length > 0 ? { mcpServers: opts.mcpServers } : {}
+      ...opts.mcpServers && Object.keys(opts.mcpServers).length > 0 ? { mcpServers: shapeMcpServers(opts.mcpServers) } : {}
     }
   });
   try {

diff --git a/sdk-bridge/src/providers/claude.ts b/sdk-bridge/src/providers/claude.ts
@@ -77,6 +77,48 @@ function emitLine(obj: unknown): void {
   process.stdout.write(JSON.stringify(obj) + "\n");
 }
 
+/**
+ * Shape the mcpServers map for the Agent SDK.
+ *
+ * Two SDK quirks bite browser-judges otherwise (observed: Playwright stayed
+ * "pending" forever, zero tools in the judge's turn-1 prompt, zero browser
+ * calls):
+ *
+ *   1. MCP startup is NON-BLOCKING by default. The turn-1 prompt is built
+ *      before a stdio server (e.g. `npx @playwright/mcp`) finishes its
+ *      connect handshake, so its tools never make it into the prompt — the
+ *      judge can't call tools it was never told about.
+ *   2. Tools are deferred behind tool-search by default, so even once the
+ *      server connects the judge has to go looking for them.
+ *
+ * Setting `alwaysLoad: true` fixes both: it forces the tools into the turn-1
+ * prompt AND blocks startup until the server connects (capped at the SDK's
+ * 5s connect timeout). We also pin `type: "stdio"` for any entry carrying a
+ * `command`, so the SDK never mis-routes a command-based server.
+ *
+ * Untouched: servers we don't recognise as stdio (http/sse maps pass through
+ * verbatim), and the operator's config-file-discovered servers (those don't
+ * come through this param at all).
+ */
+function shapeMcpServers(
+  servers: Record<string, unknown>,
+): Record<string, unknown> {
+  const out: Record<string, unknown> = {};
+  for (const [name, raw] of Object.entries(servers)) {
+    if (raw && typeof raw === "object" && "command" in (raw as object)) {
+      const cfg = raw as Record<string, unknown>;
+      out[name] = {
+        type: "stdio",
+        alwaysLoad: true,
+        ...cfg,
+      };
+    } else {
+      out[name] = raw;
+    }
+  }
+  return out;
+}
+
 export async function run(opts: RunOptions): Promise<void> {
   const { model, effort } = resolveModel(opts.model);
 
@@ -101,7 +143,7 @@ export async function run(opts: RunOptions): Promise<void> {
       // discovery. Non-empty → judges get the declared tools (e.g.
       // Playwright for judge_4's browser-based UI review).
       ...(opts.mcpServers && Object.keys(opts.mcpServers).length > 0
-        ? { mcpServers: opts.mcpServers as any }
+        ? { mcpServers: shapeMcpServers(opts.mcpServers) as any }
         : {}),
     },
   });

diff --git a/tests/automation/test_judge_panel_browser_gating.py b/tests/automation/test_judge_panel_browser_gating.py
@@ -8,7 +8,9 @@
 
 from __future__ import annotations
 
-from cube.automation.judge_panel import _has_frontend_changes
+from pathlib import Path
+
+from cube.automation.judge_panel import _augment_playwright_args, _has_frontend_changes
 from cube.models.types import JudgeInfo
 
 
@@ -75,3 +77,48 @@ def test_mixed_browser_and_non_browser_judges():
         _judge(mcp=True, scoped_files=["src/Button.tsx"]),
     ]
     assert _has_frontend_changes(judges) is True
+
+
+# ---------------------------------------------------------------------------
+# _augment_playwright_args: runtime flags injected into the @playwright/mcp
+# stdio server so a headless judge can actually drive a browser and its
+# screenshots land in the right place.
+# ---------------------------------------------------------------------------
+
+
+def test_augment_injects_headless_isolated_and_output_dir():
+    servers = {"playwright": {"command": "npx", "args": ["-y", "@playwright/mcp@latest"]}}
+    out = _augment_playwright_args(servers, Path("/tmp/shots/judge_4"))
+    args = out["playwright"]["args"]
+    assert "--headless" in args
+    assert "--isolated" in args
+    assert args[args.index("--output-dir") + 1] == "/tmp/shots/judge_4"
+    # Original list not mutated.
+    assert servers["playwright"]["args"] == ["-y", "@playwright/mcp@latest"]
+
+
+def test_augment_is_idempotent_and_respects_operator_flags():
+    servers = {
+        "playwright": {
+            "command": "npx",
+            "args": ["-y", "@playwright/mcp@latest", "--headless", "--output-dir", "/custom"],
+        }
+    }
+    out = _augment_playwright_args(servers, Path("/tmp/shots/judge_4"))
+    args = out["playwright"]["args"]
+    # No duplicate --headless, operator's --output-dir wins.
+    assert args.count("--headless") == 1
+    assert args.count("--output-dir") == 1
+    assert args[args.index("--output-dir") + 1] == "/custom"
+    # --isolated still gets added (operator didn't set it).
+    assert "--isolated" in args
+
+
+def test_augment_ignores_non_playwright_servers():
+    servers = {"other": {"command": "node", "args": ["server.js"]}}
+    out = _augment_playwright_args(servers, Path("/tmp/x"))
+    assert out["other"]["args"] == ["server.js"]
+
+
+def test_augment_handles_empty():
+    assert _augment_playwright_args({}, Path("/tmp/x")) == {}