Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 138 additions & 19 deletions python/cube/automation/judge_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,75 @@ def _maybe_start_dev_server(judges: list, config, task_id: str):
# configured AND (b) node_modules is missing in the chosen worktree.
_ensure_node_modules_for_dev_server(worktree, config)

# Ensure Playwright's Chromium is present. @playwright/mcp shells out to
# a real browser; if it was never installed (`browser_navigate` would
# fail with "Executable doesn't exist"), install it now. Cheap no-op once
# the cache exists. Mirrors the node_modules fallback above — cube auto's
# environment can't be assumed by cube prv.
_ensure_playwright_browsers(judges)

return start_dev_server(
worktree=worktree,
config=config.verify.dev_server,
task_id=task_id,
)


def _ensure_playwright_browsers(judges: list) -> None:
"""Install Playwright's Chromium when a browser-judge needs it.

Gated on at least one judge declaring an ``@playwright/mcp`` server.
Skips the install when the ms-playwright cache already has a chromium
build (the common case after first run / in a baked container image).

Non-fatal: on failure the MCP server's ``browser_navigate`` will error
at call time and the judge falls back to static review — same graceful
degradation as a dead dev server. We don't crash the panel over a
browser download.
"""
wants_playwright = any(
isinstance(cfg, dict) and any(isinstance(a, str) and "@playwright/mcp" in a for a in (cfg.get("args") or []))
for j in judges
for cfg in (j.mcp_servers or {}).values()
)
if not wants_playwright:
return

# ms-playwright cache lives under a platform-specific dir; check the two
# common locations rather than importing playwright (it's a node dep, not
# a python one).
cache_dirs = [
Path.home() / "Library" / "Caches" / "ms-playwright", # macOS
Path.home() / ".cache" / "ms-playwright", # Linux
]
has_chromium = any(d.exists() and any(p.name.startswith("chromium") for p in d.iterdir()) for d in cache_dirs)
if has_chromium:
return

import subprocess as _subprocess

print_info("playwright: Chromium not installed — running `npx playwright install chromium` (one-time)")
try:
result = _subprocess.run(
"npx --yes playwright install chromium chromium-headless-shell",
shell=True,
capture_output=True,
text=True,
timeout=300,
)
if result.returncode != 0:
tail = (result.stderr or result.stdout or "").splitlines()[-5:]
print_warning(
"playwright: Chromium install failed (exit %d) — browser tools "
"will fail and the judge will review statically. Last lines:\n %s"
% (result.returncode, "\n ".join(tail))
)
else:
print_info("playwright: Chromium install complete")
except (OSError, _subprocess.TimeoutExpired) as e:
print_warning(f"playwright: Chromium install raised {type(e).__name__}: {e}")


def _ensure_node_modules_for_dev_server(worktree: Path, config) -> None:
"""Lazy-install dependencies into ``worktree`` when missing.

Expand Down Expand Up @@ -290,6 +352,51 @@ def _populate_browser_context(judges: list, handle, task_id: str) -> None:
j_dir = screenshots_root / j.key
j_dir.mkdir(parents=True, exist_ok=True)
j.screenshots_dir = j_dir
j.mcp_servers = _augment_playwright_args(j.mcp_servers, j_dir)


def _augment_playwright_args(mcp_servers: dict, screenshots_dir: Path) -> dict:
"""Inject runtime flags into any ``@playwright/mcp`` stdio server.

Three flags matter for a headless judge subprocess:

* ``--headless`` — @playwright/mcp launches *headed* by default. A headed
browser has nowhere to draw in a judge subprocess (and certainly not in
the cube-runner container), so it hangs/fails and the MCP server never
finishes connecting. This is the single biggest cause of the "pending"
server we saw in the wild.
* ``--output-dir <screenshots_dir>`` — ``browser_take_screenshot`` writes
into the server's output dir, not an arbitrary path. Point it at this
judge's screenshot dir so the files land where the summary block looks
for them.
* ``--isolated`` — in-memory profile, nothing persisted to disk between
runs. Judges are ephemeral; we don't want a profile accreting state.

Idempotent: only touches servers whose args reference ``@playwright/mcp``,
and only appends a flag that isn't already present (operator overrides
win). Returns a new dict; does not mutate the input.
"""
if not mcp_servers:
return mcp_servers

augmented: dict = {}
for name, cfg in mcp_servers.items():
if not isinstance(cfg, dict):
augmented[name] = cfg
continue
args = list(cfg.get("args", []) or [])
is_playwright = any(isinstance(a, str) and "@playwright/mcp" in a for a in args)
if not is_playwright:
augmented[name] = cfg
continue
if "--headless" not in args:
args.append("--headless")
if "--isolated" not in args:
args.append("--isolated")
if "--output-dir" not in args:
args += ["--output-dir", str(screenshots_dir)]
augmented[name] = {**cfg, "args": args}
return augmented


def _apply_judge_persona(prompt: str, judge_info: JudgeInfo) -> str:
Expand Down Expand Up @@ -354,28 +461,34 @@ def _apply_browser_addendum(prompt: str, judge_info: JudgeInfo) -> str:
screenshots_dir = judge_info.screenshots_dir
addendum = f"""# BROWSER REVIEW (Playwright MCP available)

You have Playwright MCP tools available for this review. Use them to
visually verify any UI change in the diff:

- ``playwright_navigate`` — open a route at the running dev server
- ``playwright_screenshot`` — capture the current viewport (full-page
or element-scoped)
- ``playwright_click`` / ``playwright_fill`` / ``playwright_select_option``
You have Playwright MCP tools available for this review. They are exposed
under the ``mcp__playwright__`` namespace — call them by their real names
(the SDK may also surface short aliases, but these are the canonical ones):

- ``mcp__playwright__browser_navigate`` — open a route at the running dev server
- ``mcp__playwright__browser_snapshot`` — accessibility snapshot of the page
(prefer this for structure/text/ARIA; it's cheaper than a screenshot)
- ``mcp__playwright__browser_take_screenshot`` — capture the viewport
(``fullPage`` for the whole page, or pass an element ref). Screenshots
are written to this judge's output directory automatically.
- ``mcp__playwright__browser_click`` / ``browser_type`` / ``browser_select_option``
— drive interactions to reach the state you want to verify
- ``playwright_evaluate`` — run JS to inspect computed styles, ARIA
labels, layout
- ``mcp__playwright__browser_evaluate`` — run JS to inspect computed styles,
ARIA labels, layout

Dev server is running at: **{judge_info.dev_server_url}**

For each changed route in the diff:
1. Navigate to it.
1. Navigate to it with ``mcp__playwright__browser_navigate``.
2. Screenshot the relevant states (default, hover/focus where
applicable, dark mode if the project has it, empty / error /
loading states if the diff touches them).
3. Save each screenshot under: ``{screenshots_dir}/<descriptive-name>.png``
(e.g. ``checkout-empty-cart-dark.png``).
4. Reference every screenshot you took in your decision file's
``screenshots`` field:
loading states if the diff touches them) with
``mcp__playwright__browser_take_screenshot``. Pass a descriptive
``filename`` (e.g. ``checkout-empty-cart-dark.png``) — the server
writes it into THIS judge's output directory automatically, which
is: ``{screenshots_dir}``
3. Reference every screenshot you took in your decision file's
``screenshots`` field, using the full path under that directory:

```json
"screenshots": [
Expand All @@ -390,10 +503,16 @@ def _apply_browser_addendum(prompt: str, judge_info: JudgeInfo) -> str:
intent, or you spot a regression text review couldn't catch, surface
it.

If you decide the change is non-visual (pure refactor, types-only,
test-only, server-side), you can skip the browser pass — but say so
explicitly in your review summary so the operator knows you considered
it.
This PR has frontend changes and a dev server is running, so the
browser pass is NOT optional: before you finalise your decision you
MUST navigate to at least one affected route and capture at least one
screenshot, even when you expect the change to look fine — a clean
screenshot is itself evidence the render path works. The only
exception is a change with genuinely no rendered surface (types-only,
test-only, pure server-side helper with no route touched); in that
case state explicitly in your summary which file you judged non-visual
and why, so the operator knows the browser pass was considered and
deliberately skipped, not forgotten.

---

Expand Down
18 changes: 17 additions & 1 deletion sdk-bridge/dist/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,22 @@ function emitLine(obj) {
process.stdout.write(JSON.stringify(obj) + `
`);
}
function shapeMcpServers(servers) {
const out = {};
for (const [name, raw] of Object.entries(servers)) {
if (raw && typeof raw === "object" && "command" in raw) {
const cfg = raw;
out[name] = {
type: "stdio",
alwaysLoad: true,
...cfg
};
} else {
out[name] = raw;
}
}
return out;
}
async function run(opts) {
const { model, effort } = resolveModel(opts.model);
const env = { ...process.env };
Expand All @@ -81,7 +97,7 @@ async function run(opts) {
allowDangerouslySkipPermissions: true,
includePartialMessages: true,
env,
...opts.mcpServers && Object.keys(opts.mcpServers).length > 0 ? { mcpServers: opts.mcpServers } : {}
...opts.mcpServers && Object.keys(opts.mcpServers).length > 0 ? { mcpServers: shapeMcpServers(opts.mcpServers) } : {}
}
});
try {
Expand Down
44 changes: 43 additions & 1 deletion sdk-bridge/src/providers/claude.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,48 @@ function emitLine(obj: unknown): void {
process.stdout.write(JSON.stringify(obj) + "\n");
}

/**
* Shape the mcpServers map for the Agent SDK.
*
* Two SDK quirks bite browser-judges otherwise (observed: Playwright stayed
* "pending" forever, zero tools in the judge's turn-1 prompt, zero browser
* calls):
*
* 1. MCP startup is NON-BLOCKING by default. The turn-1 prompt is built
* before a stdio server (e.g. `npx @playwright/mcp`) finishes its
* connect handshake, so its tools never make it into the prompt — the
* judge can't call tools it was never told about.
* 2. Tools are deferred behind tool-search by default, so even once the
* server connects the judge has to go looking for them.
*
* Setting `alwaysLoad: true` fixes both: it forces the tools into the turn-1
* prompt AND blocks startup until the server connects (capped at the SDK's
* 5s connect timeout). We also pin `type: "stdio"` for any entry carrying a
* `command`, so the SDK never mis-routes a command-based server.
*
* Untouched: servers we don't recognise as stdio (http/sse maps pass through
* verbatim), and the operator's config-file-discovered servers (those don't
* come through this param at all).
*/
function shapeMcpServers(
servers: Record<string, unknown>,
): Record<string, unknown> {
const out: Record<string, unknown> = {};
for (const [name, raw] of Object.entries(servers)) {
if (raw && typeof raw === "object" && "command" in (raw as object)) {
const cfg = raw as Record<string, unknown>;
out[name] = {
type: "stdio",
alwaysLoad: true,
...cfg,
};
} else {
out[name] = raw;
}
}
return out;
}

export async function run(opts: RunOptions): Promise<void> {
const { model, effort } = resolveModel(opts.model);

Expand All @@ -101,7 +143,7 @@ export async function run(opts: RunOptions): Promise<void> {
// discovery. Non-empty → judges get the declared tools (e.g.
// Playwright for judge_4's browser-based UI review).
...(opts.mcpServers && Object.keys(opts.mcpServers).length > 0
? { mcpServers: opts.mcpServers as any }
? { mcpServers: shapeMcpServers(opts.mcpServers) as any }
: {}),
},
});
Expand Down
49 changes: 48 additions & 1 deletion tests/automation/test_judge_panel_browser_gating.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

from __future__ import annotations

from cube.automation.judge_panel import _has_frontend_changes
from pathlib import Path

from cube.automation.judge_panel import _augment_playwright_args, _has_frontend_changes
from cube.models.types import JudgeInfo


Expand Down Expand Up @@ -75,3 +77,48 @@ def test_mixed_browser_and_non_browser_judges():
_judge(mcp=True, scoped_files=["src/Button.tsx"]),
]
assert _has_frontend_changes(judges) is True


# ---------------------------------------------------------------------------
# _augment_playwright_args: runtime flags injected into the @playwright/mcp
# stdio server so a headless judge can actually drive a browser and its
# screenshots land in the right place.
# ---------------------------------------------------------------------------


def test_augment_injects_headless_isolated_and_output_dir():
servers = {"playwright": {"command": "npx", "args": ["-y", "@playwright/mcp@latest"]}}
out = _augment_playwright_args(servers, Path("/tmp/shots/judge_4"))
args = out["playwright"]["args"]
assert "--headless" in args
assert "--isolated" in args
assert args[args.index("--output-dir") + 1] == "/tmp/shots/judge_4"
# Original list not mutated.
assert servers["playwright"]["args"] == ["-y", "@playwright/mcp@latest"]


def test_augment_is_idempotent_and_respects_operator_flags():
servers = {
"playwright": {
"command": "npx",
"args": ["-y", "@playwright/mcp@latest", "--headless", "--output-dir", "/custom"],
}
}
out = _augment_playwright_args(servers, Path("/tmp/shots/judge_4"))
args = out["playwright"]["args"]
# No duplicate --headless, operator's --output-dir wins.
assert args.count("--headless") == 1
assert args.count("--output-dir") == 1
assert args[args.index("--output-dir") + 1] == "/custom"
# --isolated still gets added (operator didn't set it).
assert "--isolated" in args


def test_augment_ignores_non_playwright_servers():
servers = {"other": {"command": "node", "args": ["server.js"]}}
out = _augment_playwright_args(servers, Path("/tmp/x"))
assert out["other"]["args"] == ["server.js"]


def test_augment_handles_empty():
assert _augment_playwright_args({}, Path("/tmp/x")) == {}
Loading