From 7d89afff5d1e655e7f787612e215583fde93b564 Mon Sep 17 00:00:00 2001
From: Hwuiwon Kim <hwuiwon.kim@gmail.com>
Date: Tue, 7 Apr 2026 18:06:29 -0400
Subject: [PATCH] Fix sandbox errors, add project docs, and update stale
 documentation

---
 .claude/commands/create-playbook.md |   2 +-
 .claude/commands/test-deployment.md | 113 ++++++++++++++++++++++++++++
 AGENTS.md                           |  93 +++++++++++++++++++++++
 CLAUDE.md                           |   1 +
 README.md                           |   5 +-
 agent/main.py                       |  19 +++--
 agent/session/finalizer.py          |  11 ++-
 api/runs/registry.py                |   8 +-
 api/runs/service.py                 |  14 ++--
 bridge/page_actions.py              |  25 +++++-
 docs/api.md                         |   6 +-
 docs/authentication.md              |   9 +--
 docs/configuration.md               |   6 +-
 docs/guardrails.md                  |   7 +-
 docs/playbooks.md                   |   2 +-
 docs/tools.md                       |   5 +-
 tests/test_run_registry.py          |  30 ++++----
 tests/test_streaming.py             |   8 +-
 uv.lock                             |   2 +-
 19 files changed, 306 insertions(+), 60 deletions(-)
 create mode 100644 .claude/commands/test-deployment.md
 create mode 100644 AGENTS.md
 create mode 100644 CLAUDE.md
diff --git a/.claude/commands/create-playbook.md b/.claude/commands/create-playbook.md
index f3c49f3..4c747b2 100644
--- a/.claude/commands/create-playbook.md
+++ b/.claude/commands/create-playbook.md
@@ -36,7 +36,7 @@ Convert the recorded interactions into a playbook YAML file. Apply these optimiz
 
 **Auth handling (IMPORTANT):**
 - **NEVER include login/auth steps** (typing username, password, clicking login) in the playbook.
-- Authentication is handled externally by `DashboardAuth.ensure_authenticated()` before the playbook runs. It restores saved sessions or performs fresh login automatically.
+- Authentication is handled externally by the auth system before the playbook runs. It detects login forms and performs a fresh login automatically.
 - If the recording includes login interactions, **strip them out**. The playbook should start from the first post-login action.
 - Set `auth_required: true` if login was part of the recording — this tells the runner to authenticate before executing.
 
diff --git a/.claude/commands/test-deployment.md b/.claude/commands/test-deployment.md
new file mode 100644
index 0000000..f886ea3
--- /dev/null
+++ b/.claude/commands/test-deployment.md
@@ -0,0 +1,113 @@
+Test the deployed CUA API on Modal to verify it's working correctly.
+
+## Prerequisites
+
+Before running tests, determine the API base URL and ensure `$CUA_API_KEY` is set in the environment (loaded via direnv). The base URL depends on the Modal workspace — check the deploy output or run:
+
+```bash
+.venv/bin/modal app list | grep cua
+```
+
+Set the base URL for the session:
+```bash
+BASE_URL="https://<workspace>--cua-serve.modal.run"
+```
+
+## Test cases
+
+Run all 4 test cases. For Tests 3 and 4, save the `run_id` from the create response and poll until `status` is `completed` or another terminal state (e.g. `failed` or `timeout`).
+
+### Test 1: Dry-run validation (config check, no sandbox)
+
+```bash
+curl -s -X POST "$BASE_URL/runs/dry-run" \
+  -H "Authorization: Bearer $CUA_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"directive": "Test directive", "max_steps": 10}' | python3 -m json.tool
+```
+
+**Expected**: `"valid": true`, all checks passed.
+
+### Test 2: Input validation (reject invalid config)
+
+```bash
+curl -s -X POST "$BASE_URL/runs/dry-run" \
+  -H "Authorization: Bearer $CUA_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"directive": "Test", "max_steps": 0}' | python3 -m json.tool
+```
+
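+**Expected**: HTTP 422 with `"code": "INVALID_REQUEST"` and error about `max_steps`. (`curl -s` prints only the body; append `-w '\n%{http_code}\n'` and drop the `json.tool` pipe to see the status code.)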
+
+### Test 3: Simple directive (example.com heading)
+
+Create a run:
+```bash
+curl -s -X POST "$BASE_URL/runs" \
+  -H "Authorization: Bearer $CUA_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "directive": "Go to https://example.com and tell me the heading text on the page",
+    "max_steps": 10,
+    "timeout_seconds": 120,
+    "start_url": "https://example.com"
+  }' | python3 -m json.tool
+```
+
+Poll until completed:
+```bash
+curl -s "$BASE_URL/runs/<run_id>" \
+  -H "Authorization: Bearer $CUA_API_KEY" | python3 -m json.tool
+```
+
+**Expected**: `"status": "completed"`, result mentions "Example Domain".
+
+### Test 4: Structured output extraction (HN top 3)
+
+Create a run with `output_schema`:
+```bash
+curl -s -X POST "$BASE_URL/runs" \
+  -H "Authorization: Bearer $CUA_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "directive": "Go to https://news.ycombinator.com and extract the titles of the top 3 stories",
+    "max_steps": 15,
+    "timeout_seconds": 180,
+    "start_url": "https://news.ycombinator.com",
+    "output_schema": {
+      "type": "object",
+      "properties": {
+        "stories": {
+          "type": "array",
+          "items": {"type": "object", "properties": {"rank": {"type": "integer"}, "title": {"type": "string"}}},
+          "maxItems": 3
+        }
+      },
+      "required": ["stories"]
+    }
+  }' | python3 -m json.tool
+```
+
+Poll `GET $BASE_URL/runs/<run_id>` (same command as Test 3) until completed (may take 20-30s).
+
+**Expected**: `"status": "completed"`, `data.stories` array with 3 items each having `rank` and `title`, no extract timeout errors in actions.
+
+## Evaluating results
+
+For each test, check:
+1. **Status**: should be `completed` (not `failed` or `timeout`)
+2. **Errors**: `error` field should be `null`
+3. **Actions**: verify no repeated timeout errors (stuck detection should catch these)
+4. **Duration**: simple directives should complete in under 30s, structured output under 60s
+
+## Troubleshooting sandbox logs
+
+If a run fails or behaves unexpectedly, check the sandbox logs in the Modal dashboard.
+
+Look for:
+- `CancelledError` in Starlette lifespan — graceful shutdown issue
+- `AuthError: Token missing` — volume commit called from sandbox context
+- `AsyncUsageWarning` — sync Modal API call in async context
+- `browser_dom.extract failed: Timeout` — selector mismatch (check if href was truncated)
+
+Report a summary table of all test results with status, duration, and any errors found.
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..7e2e955
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,93 @@
+# CUA — Computer Use Agent
+
+Autonomous browser automation agent deployed on Modal. Accepts natural-language directives, executes them via a headless Chromium browser in a sandboxed VM, and returns structured results.
+
+## Quick reference
+
+```bash
+# Run tests
+.venv/bin/python -m pytest tests/ -x -q
+
+# Lint
+.venv/bin/ruff check .
+
+# Type check
+.venv/bin/ty check
+
+# Deploy to Modal
+.venv/bin/modal deploy api/server.py::modal_app
+
+# Run agent locally (requires DISPLAY / Xvfb)
+.venv/bin/python scripts/run_local.py --directive "..." --start-url "https://..."
+```
+
+## Architecture
+
+```
+api/             → Outer FastAPI service (Modal Function), handles /runs CRUD
+  server.py      → FastAPI app + Modal ASGI entrypoint (modal_app lives in modal_app.py)
+  modal_app.py   → Modal App definition, image builds, volume/dict setup
+  runs/          → RunService, RunRegistry (in-memory + Modal Dict), RunHandle
+  streaming.py   → In-sandbox status API (port 8090), SSE events, status persistence
+  models.py      → RunConfig, RunStatus, GuardrailSettings, ActionEvent
+
+agent/           → Agent loop (runs inside Modal Sandbox)
+  main.py        → Sandbox entrypoint — starts status API, runs session, handles shutdown
+  loop.py        → PydanticAI agent loop with tool definitions
+  session/       → SessionRunner (browser + agent lifecycle), RunFinalizer
+  tools.py       → browser_dom tool implementation
+  hooks.py       → PydanticAI hooks (preflight guardrails, thinking capture, error recovery)
+
+bridge/          → Browser abstraction layer
+  browser.py     → BrowserManager (Patchright wrapper, page lifecycle)
+  execution.py   → Action handlers (click, extract, goto, etc.), SequenceExecutor
+  page_actions.py → Primitive page actions with shared semantics
+  observation.py → DOM snapshots, mutations, screenshots
+  scripts/       → JS injected into pages (page_context.js, recorder.js)
+
+sandbox/         → Modal Sandbox definition
+  image.py       → Ubuntu 24.04 image with desktop env, Patchright, agent runtime
+  entrypoint.sh  → Starts Xvfb + openbox, runs agent/main.py
+
+guardrails/      → Runtime safety checks
+  stuck.py       → Stuck detection (repetition, cycles, failure clusters, URL revisits)
+  scope.py       → Domain allowlist/blocklist, action permissions
+
+blinders/        → Directive classification (goal type, login detection, action filtering)
+playbooks/       → YAML-defined deterministic workflows with LLM fallback
+evaluation/      → Benchmark suite runner and scoring engine
+recording/       → Playwright trace capture and artifact management
+telemetry/       → OpenTelemetry tracing, structured logging, metrics
+```
+
+## Key conventions
+
+- **Python 3.13+**, managed with `uv`. Virtual env at `.venv/`.
+- **Environment**: uses `direnv` — secrets loaded from `.envrc` (not committed).
+- **Settings**: all env vars centralized in `settings.py` via Pydantic Settings. Never scatter `os.environ.get()`.
+- **Models**: `PRIMARY_MODEL` and `UTILITY_MODEL` constants in `settings.py`. Change there to switch everywhere.
+- **Modal deploy target**: `api/server.py::modal_app` — the app variable is named `modal_app`, not `app`.
+- **Sandbox vs Function**: Code in `agent/` runs inside Modal Sandboxes (no Modal API token). Code in `api/` runs in Modal Functions (has Modal auth). Don't call `modal.Volume.commit()` from sandbox code.
+- **Tests**: `pytest` with `asyncio_mode = "auto"`. Integration tests marked `@pytest.mark.integration`. Run `.venv/bin/python -m pytest tests/ -x -q` for the full suite (`-x` stops at the first failure).
+- **Lint**: `ruff` with bugbear, isort, pyupgrade, and pep8-naming rules. Line length 88.
+
+## API endpoints
+
+| Method | Path | Description |
+|--------|------|-------------|
+| POST | /runs/dry-run | Validate config without executing |
+| POST | /runs | Create and start a new run |
+| GET | /runs/{run_id} | Poll run status |
+| POST | /runs/{run_id}/stop | Terminate a run |
+| GET | /runs/{run_id}/stream | SSE event stream |
+| GET | /runs/{run_id}/recording/manifest | List recording artifacts |
+| GET | /runs/{run_id}/recording/trace | Download Playwright trace ZIP |
+
+Auth: `Authorization: Bearer $CUA_API_KEY` (set in Modal secret `cua-secret`).
+
+## Common patterns
+
+- **DOM snapshot truncation**: `page_context.js` truncates hrefs to 60 chars. When an exact `[href="…"]` selector fails, the extract action retries it as a starts-with match (`[href^="…"]`).
+- **Stuck detection**: sliding window over recent actions, checks for repetition, cycles, failure clusters, and URL revisits. Configurable via `GuardrailSettings`.
+- **Session memory**: injected into the system prompt before each LLM request so the agent retains awareness of prior work even after context pruning.
+- **Playbook execution**: YAML-defined step sequences with selector fallbacks, verification checks, and LLM handoff on failure.
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..eef4bd2
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1 @@
+@AGENTS.md
\ No newline at end of file
diff --git a/README.md b/README.md
index 7c6ad1a..ca95a04 100644
--- a/README.md
+++ b/README.md
@@ -195,11 +195,10 @@ cua/
 
 | Topic | Description |
 |---|---|
-| [Architecture](docs/architecture.md) | Full sequence diagram and component overview |
 | [API Reference](docs/api.md) | Endpoints, SSE streaming, replay, multi-container support |
-| [Browser Tools](docs/tools.md) | 9 browser actions, `execute_sequence` batching, design choices |
+| [Browser Tools](docs/tools.md) | 10 browser actions, `execute_sequence` batching, design choices |
 | [Playbooks](docs/playbooks.md) | Deterministic workflows, selector fallbacks, LLM handoff |
-| [Authentication](docs/authentication.md) | Session persistence, credential refs, `SecretValue`, and security caveats |
+| [Authentication](docs/authentication.md) | Credential refs, `SecretValue`, and security caveats |
 | [Guardrails](docs/guardrails.md) | Cognitive Blinders, runtime safety, domain/action controls |
 | [Recording](docs/recording.md) | Playwright tracing, session replay |
 | [Evaluation](docs/evaluation.md) | Benchmark suites, trial scoring, pass/fail expectations |
diff --git a/agent/main.py b/agent/main.py
index cfcf0b9..57daf90 100644
--- a/agent/main.py
+++ b/agent/main.py
@@ -18,21 +18,29 @@
 import os
 import signal
 import sys
+from typing import TYPE_CHECKING
 
 from telemetry.logging import setup_logging
 
+if TYPE_CHECKING:
+    import uvicorn
+
 setup_logging()
 logger = logging.getLogger("cua.agent")
 
 STATUS_API_PORT = 8090
 
 
-async def _start_status_api() -> asyncio.Task:
+async def _start_status_api() -> tuple[asyncio.Task, uvicorn.Server]:
     """Start the in-sandbox status API as a background asyncio task.
 
     Runs uvicorn in the same process so the status API shares module
     globals with the agent loop (push_action / complete_run update the
     same _status and _subscribers objects that GET /events reads from).
+
+    Returns ``(task, server)`` so the caller can trigger a graceful
+    shutdown via ``server.should_exit = True`` instead of cancelling
+    the task (which causes a noisy ``CancelledError`` in Starlette).
     """
     import uvicorn
 
@@ -48,7 +56,7 @@ async def _start_status_api() -> asyncio.Task:
     task = asyncio.create_task(server.serve())
     # Give uvicorn a moment to bind the port
     await asyncio.sleep(0.5)
-    return task
+    return task, server
 
 
 async def main() -> int:
@@ -91,7 +99,7 @@ async def main() -> int:
     logger.info("Directive: %s", config.directive[:200])
 
     # Start status API in-process (shares globals with agent loop)
-    status_task = await _start_status_api()
+    status_task, status_server = await _start_status_api()
     logger.info("Status API started on port %d (in-process)", STATUS_API_PORT)
 
     # Initialize status API state
@@ -143,9 +151,10 @@ def _request_shutdown(sig: int) -> None:
     except asyncio.CancelledError:
         result = 1
 
-    # Cancel the status API after a grace period for final SSE delivery
+    # Keep the status API alive so the outer API can do final polling
+    # during the entrypoint keep-alive window, then shut down gracefully.
     await asyncio.sleep(1)
-    status_task.cancel()
+    status_server.should_exit = True
     with contextlib.suppress(asyncio.CancelledError):
         await status_task
 
diff --git a/agent/session/finalizer.py b/agent/session/finalizer.py
index dc33802..e8d2fa6 100644
--- a/agent/session/finalizer.py
+++ b/agent/session/finalizer.py
@@ -24,7 +24,16 @@
 
 
 async def _commit_recording_volume() -> None:
-    """Commit the recordings volume so the outer API can read persisted data."""
+    """Commit the recordings volume so the outer API can read persisted data.
+
+    Inside a Modal sandbox the volume is auto-synced on exit and the Modal
+    API token is unavailable, so we skip the explicit commit.
+    """
+    from settings import get_settings
+
+    if get_settings().modal_sandbox_id != "local":
+        logger.debug("Skipping volume commit (sandbox auto-syncs on exit)")
+        return
     try:
         vol = modal.Volume.from_name(_RECORDING_VOLUME_NAME)
         await vol.commit.aio()
diff --git a/api/runs/registry.py b/api/runs/registry.py
index 2155f65..37761b3 100644
--- a/api/runs/registry.py
+++ b/api/runs/registry.py
@@ -50,7 +50,7 @@ def add(self, handle: RunHandle) -> None:
     def get(self, run_id: str) -> RunHandle | None:
         raise NotImplementedError
 
-    def remove(self, run_id: str) -> RunHandle | None:
+    async def remove(self, run_id: str) -> RunHandle | None:
         raise NotImplementedError
 
     def contains(self, run_id: str) -> bool:
@@ -69,7 +69,7 @@ def add(self, handle: RunHandle) -> None:
     def get(self, run_id: str) -> RunHandle | None:
         return self._runs.get(run_id)
 
-    def remove(self, run_id: str) -> RunHandle | None:
+    async def remove(self, run_id: str) -> RunHandle | None:
         return self._runs.pop(run_id, None)
 
     def contains(self, run_id: str) -> bool:
@@ -106,10 +106,10 @@ def get(self, run_id: str) -> RunHandle | None:
         # via modal.Sandbox.from_id() and re-adds to the registry.
         return self._local.get(run_id)
 
-    def remove(self, run_id: str) -> RunHandle | None:
+    async def remove(self, run_id: str) -> RunHandle | None:
         handle = self._local.pop(run_id, None)
         with contextlib.suppress(Exception):
-            self._dict.pop(run_id)
+            await self._dict.pop.aio(run_id)
         return handle
 
     def contains(self, run_id: str) -> bool:
diff --git a/api/runs/service.py b/api/runs/service.py
index c4b6922..b370678 100644
--- a/api/runs/service.py
+++ b/api/runs/service.py
@@ -234,8 +234,8 @@ def __init__(
             volume=volume,
         )
 
-    def remove_handle(self, run_id: str) -> None:
-        self._registry.remove(run_id)
+    async def remove_handle(self, run_id: str) -> None:
+        await self._registry.remove(run_id)
 
     def _mark_run_active(self, run_id: str) -> None:
         self._active_run_ids.add(run_id)
@@ -302,7 +302,7 @@ async def cleanup_finished_sandbox(self, run_id: str) -> bool:
         logger.info(
             "Cleaning up finished sandbox for run %s (exit code %s)", run_id, exit_code
         )
-        self.remove_handle(run_id)
+        await self.remove_handle(run_id)
         self._mark_run_inactive(run_id)
         return True
 
@@ -402,7 +402,7 @@ async def get_status(self, run_id: str) -> RunStatus:
             resp.raise_for_status()
             return RunStatus.model_validate(resp.json())
         except (ValidationError, ValueError, TypeError) as exc:
-            self.remove_handle(run_id)
+            await self.remove_handle(run_id)
             self._mark_run_inactive(run_id)
             logger.warning(
                 "Invalid status payload for run %s: %s",
@@ -419,7 +419,7 @@ async def get_status(self, run_id: str) -> RunStatus:
                 ),
             )
         except httpx.HTTPError as exc:
-            self.remove_handle(run_id)
+            await self.remove_handle(run_id)
             self._mark_run_inactive(run_id)
             logger.warning("Status request failed for run %s: %s", run_id, exc)
             return await self._terminated_status(
@@ -453,7 +453,7 @@ async def stop_run(self, run_id: str) -> dict[str, str | RunStatusValue]:
                 "Terminate call failed for run %s: %s", run_id, exc, exc_info=True
             )
 
-        self.remove_handle(run_id)
+        await self.remove_handle(run_id)
         self._mark_run_inactive(run_id)
         logger.info("Terminated run %s", run_id)
         return {"status": RunStatusValue.TERMINATED, "run_id": run_id}
@@ -525,7 +525,7 @@ async def proxy_events():
                         yielded_any = True
                         yield line + "\n"
             except httpx.HTTPError as exc:
-                self.remove_handle(run_id)
+                await self.remove_handle(run_id)
                 self._mark_run_inactive(run_id)
                 if not yielded_any:
                     persisted = await self.build_persisted_event_stream(
diff --git a/bridge/page_actions.py b/bridge/page_actions.py
index 455130d..559ee4d 100644
--- a/bridge/page_actions.py
+++ b/bridge/page_actions.py
@@ -182,6 +182,21 @@ async def execute_page_action(
     raise ValueError(f"Unknown browser_dom action: {action}")
 
 
+_HREF_EXACT_RE = __import__("re").compile(r'\[href="([^"]+)"\]')
+
+
+def _relax_href_selector(selector: str) -> str | None:
+    """Rewrite exact ``[href="…"]`` matches as starts-with ``[href^="…"]``.
+
+    The DOM snapshot truncates hrefs (e.g. 60 chars), so an exact-match CSS
+    selector built from a truncated href finds no element.  Returns ``None``
+    when *selector* contains no double-quoted exact href match to relax.
+    """
+    if _HREF_EXACT_RE.search(selector):
+        return _HREF_EXACT_RE.sub(r'[href^="\1"]', selector)
+    return None
+
+
 async def extract_content(
     page: Any,
     *,
@@ -204,7 +219,15 @@ async def extract_content(
             EXTRACT_VALUE_CALL_JS,
             [selector, EXTRACT_VALUE_INIT_JS],
         )
-    return await page.inner_text(selector, timeout=timeout_ms)
+    try:
+        return await page.inner_text(selector, timeout=timeout_ms)
+    except Exception:
+        # Retry with starts-with match for truncated href selectors
+        relaxed = _relax_href_selector(selector)
+        if relaxed:
+            logger.debug("Retrying extract with relaxed selector: %s", relaxed)
+            return await page.inner_text(relaxed, timeout=timeout_ms)
+        raise
 
 
 async def _extract_markdown(page: Any) -> str:
diff --git a/docs/api.md b/docs/api.md
index 7f7d505..f1bd759 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -19,7 +19,7 @@ CUA deploys to [Modal](https://modal.com) as a managed API. Each run spawns an i
 | Field | Type | Default | Description |
 |---|---|---|---|
 | `directive` | string | (required) | Natural language task |
-| `model` | string | `google-gla:gemini-3-flash-preview` | LLM model |
+| `model` | string | `openai-responses:gpt-5.4` | LLM model |
 | `max_steps` | int | 50 | Max agent iterations |
 | `timeout_seconds` | int | 600 | Sandbox timeout (30-3600) |
 | `thinking` | string | `high` | Thinking effort level |
@@ -48,11 +48,11 @@ curl -X POST https://<workspace>--cua-serve.modal.run/runs/dry-run \
     {"name": "profile", "passed": true, "message": "Profile 'default' loaded"},
     {"name": "credentials", "passed": true, "message": "No credentials (anonymous run)"},
     {"name": "guardrails", "passed": true, "message": "Default guardrails"},
-    {"name": "model", "passed": true, "message": "Model: google-gla:gemini-3-flash-preview"}
+    {"name": "model", "passed": true, "message": "Model: openai-responses:gpt-5.4"}
   ],
   "warnings": [],
   "config_summary": {
-    "model": "google-gla:gemini-3-flash-preview",
+    "model": "openai-responses:gpt-5.4",
     "max_steps": 50,
     "timeout_seconds": 600,
     "thinking": "high",
diff --git a/docs/authentication.md b/docs/authentication.md
index b3d10fd..b3ba0c9 100644
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -2,7 +2,7 @@
 
 ## Dashboard Authentication
 
-CUA handles dashboard login with session persistence:
+CUA handles dashboard login automatically:
 
 ```bash
 python scripts/run_local.py \
@@ -11,12 +11,7 @@ python scripts/run_local.py \
   --credentials '{"email": "admin@company.com", "password": "secret"}'
 ```
 
-The auth system:
-1. Tries restoring a previously saved session (cookies/localStorage)
-2. If expired, logs in by detecting common form patterns (email/username + password fields)
-3. Saves the new session for future runs
-
-Sessions are stored at `~/.cua/sessions/` and reused across runs.
+The auth system detects common login form patterns (email/username + password fields) and performs a fresh login each run. Credentials are resolved at fill time via `credential_ref` so secrets never appear in the LLM prompt.
 
 ## Credential Security
 
diff --git a/docs/configuration.md b/docs/configuration.md
index 828f920..87ab9e2 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -8,7 +8,7 @@
 | `--playbook` | None | Playbook ID to execute deterministically |
 | `--playbook-params` | None | JSON dict of playbook parameters |
 | `--credentials` | None | JSON credentials: `'{"username": "...", "password": "..."}'` |
-| `--model` | `anthropic:claude-sonnet-4-6` | Model for LLM agent (any PydanticAI-supported model) |
+| `--model` | `openai-responses:gpt-5.4` | Model for LLM agent (any PydanticAI-supported model) |
 | `--max-steps` | 50 | Max tool-call iterations (LLM path only) |
 | `--thinking` | `high` | Thinking effort level (`minimal`, `low`, `medium`, `high`, `xhigh`) |
 | `--allow-private-networks` | false | Allow localhost and private IPs |
@@ -20,8 +20,8 @@
 CUA uses [PydanticAI](https://ai.pydantic.dev/) and works with any model it supports — Anthropic, OpenAI, Google Gemini, Groq, and more. Set the model in `settings.py`:
 
 ```python
-PRIMARY_MODEL = "google-gla:gemini-3-flash-preview"      # main agent
-UTILITY_MODEL = "google-gla:gemini-3.1-flash-lite-preview"  # classification, guardrails, extraction
+PRIMARY_MODEL = "openai-responses:gpt-5.4"      # main agent
+UTILITY_MODEL = "openai-responses:gpt-5.4-mini"  # classification, guardrails, extraction
 ```
 
 To switch providers, change the model string and set the corresponding API key:
diff --git a/docs/guardrails.md b/docs/guardrails.md
index c1638de..5f8eae6 100644
--- a/docs/guardrails.md
+++ b/docs/guardrails.md
@@ -108,12 +108,15 @@ guardrails:
   allowed_domains: ["*.internal.com"]   # Domain allowlist (optional)
 
   # Stuck detection thresholds
-  stuck_window_size: 8                  # Sliding window of recent actions
+  stuck_window_size: 12                 # Sliding window of recent actions
   stuck_repeat_hint: 3                  # Same action N times → hint
   stuck_repeat_warn: 5                  # Same action N times → warning
   stuck_repeat_stop: 7                  # Same action N times → hard stop
   stuck_cycle_max_length: 3             # Max cycle pattern length (e.g. A-B-C)
-  stuck_cycle_repeats: 3                 # Cycle must repeat N times to trigger
+  stuck_cycle_repeats: 3               # Cycle must repeat N times to trigger
+  stuck_revisit_gap: 5                  # Min steps between URL revisits before warning
+  stuck_failure_cluster_window: 5       # Window for failure cluster detection
+  stuck_failure_cluster_threshold: 3    # Failed actions in window to trigger cluster alert
 ```
 
 When omitted, safe defaults apply (private networks blocked, LLM checks enabled, standard limits).
diff --git a/docs/playbooks.md b/docs/playbooks.md
index 01c2752..b72202a 100644
--- a/docs/playbooks.md
+++ b/docs/playbooks.md
@@ -53,7 +53,7 @@ Playbooks support these actions (same as the LLM agent's `browser_dom` tool):
   - `llm_recover` (default) — after 2 failures, hands off ALL remaining steps to the full LLM agent
   - `retry` — retry without LLM fallback
   - `abort` — stop immediately
-- **Authentication**: Built-in login flow with session persistence via Playwright cookies/localStorage. Sessions saved to `~/.cua/sessions/` and reused across runs.
+- **Authentication**: Built-in login flow that detects common form patterns (email/username + password fields) and performs a fresh login when `auth_required: true`.
 
 ## Execution Tiers
 
diff --git a/docs/tools.md b/docs/tools.md
index 026a232..516d3d8 100644
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -1,6 +1,6 @@
 # Browser Tools
 
-CUA exposes a single `browser_dom` tool with 9 actions. The agent chooses which action to call based on the task and page state.
+CUA exposes a single `browser_dom` tool with 10 actions. The agent chooses which action to call based on the task and page state.
 
 ## Actions
 
@@ -13,7 +13,8 @@ CUA exposes a single `browser_dom` tool with 9 actions. The agent chooses which
 | `scroll(direction, amount)` | Scroll the page | Page map |
 | `extract(selector, mode)` | Extract content as markdown (default), text, HTML, or form values | Content string + page map |
 | `get_dom(selector?)` | Get a compact DOM snapshot (optionally scoped) | DOM string |
-| `wait_for(selector, state)` | Wait for an element to be visible, hidden, etc. | Confirmation |
+| `select(selector, value)` | Select a dropdown option | Confirmation |
+| `evaluate(script)` | Execute arbitrary JavaScript on the page | Page map (if URL changed) |
 | `execute_sequence(steps)` | **Batch multiple actions in a single tool call** | Combined results + page map |
 
 ## Why `execute_sequence` matters
diff --git a/tests/test_run_registry.py b/tests/test_run_registry.py
index 946757e..f909594 100644
--- a/tests/test_run_registry.py
+++ b/tests/test_run_registry.py
@@ -118,58 +118,58 @@ def test_get_does_not_query_remote_on_cache_miss(self):
 
 
 class TestModalDictRunRegistryRemove:
-    def test_remove_deletes_handle_from_local_cache(self):
+    async def test_remove_deletes_handle_from_local_cache(self):
         registry, _ = _make_registry()
         handle = _make_handle("run-1")
         registry._local["run-1"] = handle
 
-        registry.remove("run-1")
+        await registry.remove("run-1")
 
         assert "run-1" not in registry._local
 
-    def test_remove_calls_dict_pop(self):
+    async def test_remove_calls_dict_pop_aio(self):
         registry, modal_dict = _make_registry()
         handle = _make_handle("run-1")
         registry._local["run-1"] = handle
 
-        registry.remove("run-1")
+        await registry.remove("run-1")
 
-        modal_dict.pop.assert_called_once_with("run-1")
+        modal_dict.pop.aio.assert_called_once_with("run-1")
 
-    def test_remove_returns_the_handle(self):
+    async def test_remove_returns_the_handle(self):
         registry, _ = _make_registry()
         handle = _make_handle("run-1")
         registry._local["run-1"] = handle
 
-        result = registry.remove("run-1")
+        result = await registry.remove("run-1")
 
         assert result is handle
 
-    def test_remove_returns_none_when_run_id_not_in_local(self):
+    async def test_remove_returns_none_when_run_id_not_in_local(self):
         registry, _ = _make_registry()
 
-        result = registry.remove("nonexistent-run")
+        result = await registry.remove("nonexistent-run")
 
         assert result is None
 
-    def test_remove_returns_handle_when_dict_pop_raises(self):
+    async def test_remove_returns_handle_when_dict_pop_raises(self):
         registry, modal_dict = _make_registry()
-        modal_dict.pop.side_effect = RuntimeError("modal unavailable")
+        modal_dict.pop.aio.side_effect = RuntimeError("modal unavailable")
         handle = _make_handle("run-1")
         registry._local["run-1"] = handle
 
-        result = registry.remove("run-1")
+        result = await registry.remove("run-1")
 
         assert result is handle
 
-    def test_remove_does_not_raise_when_dict_pop_raises(self):
+    async def test_remove_does_not_raise_when_dict_pop_raises(self):
         registry, modal_dict = _make_registry()
-        modal_dict.pop.side_effect = Exception("network error")
+        modal_dict.pop.aio.side_effect = Exception("network error")
         handle = _make_handle("run-1")
         registry._local["run-1"] = handle
 
         # Failure must be swallowed
-        registry.remove("run-1")
+        await registry.remove("run-1")
 
 
 class TestModalDictRunRegistryContains:
diff --git a/tests/test_streaming.py b/tests/test_streaming.py
index 2cae6e5..c449554 100644
--- a/tests/test_streaming.py
+++ b/tests/test_streaming.py
@@ -73,18 +73,18 @@ def test_phase_transition_to_terminated(self):
         handle.phase = RunPhase.TERMINATED
         assert handle.phase == RunPhase.TERMINATED
 
-    def test_registry_add_get_remove(self):
+    async def test_registry_add_get_remove(self):
         reg = InMemoryRunRegistry()
         handle = RunHandle(run_id="r1", sandbox=None, status_base_url="http://x")
         reg.add(handle)
         assert reg.get("r1") is handle
-        removed = reg.remove("r1")
+        removed = await reg.remove("r1")
         assert removed is handle
         assert reg.get("r1") is None
 
-    def test_registry_remove_nonexistent(self):
+    async def test_registry_remove_nonexistent(self):
         reg = InMemoryRunRegistry()
-        assert reg.remove("nonexistent") is None
+        assert await reg.remove("nonexistent") is None
 
     def test_error_field(self):
         handle = RunHandle(
diff --git a/uv.lock b/uv.lock
index aea0595..7b12961 100644
--- a/uv.lock
+++ b/uv.lock
@@ -454,7 +454,7 @@ wheels = [
 
 [[package]]
 name = "cua"
-version = "0.2.0"
+version = "0.4.0"
 source = { virtual = "." }
 dependencies = [
     { name = "anthropic" },