From 7d89afff5d1e655e7f787612e215583fde93b564 Mon Sep 17 00:00:00 2001 From: Hwuiwon Kim Date: Tue, 7 Apr 2026 18:06:29 -0400 Subject: [PATCH] Fix sandbox errors, add project docs, and update stale documentation --- .claude/commands/create-playbook.md | 2 +- .claude/commands/test-deployment.md | 113 ++++++++++++++++++++++++++++ AGENTS.md | 93 +++++++++++++++++++++++ CLAUDE.md | 1 + README.md | 5 +- agent/main.py | 19 +++-- agent/session/finalizer.py | 11 ++- api/runs/registry.py | 8 +- api/runs/service.py | 14 ++-- bridge/page_actions.py | 25 +++++- docs/api.md | 6 +- docs/authentication.md | 9 +-- docs/configuration.md | 6 +- docs/guardrails.md | 7 +- docs/playbooks.md | 2 +- docs/tools.md | 5 +- tests/test_run_registry.py | 30 ++++---- tests/test_streaming.py | 8 +- uv.lock | 2 +- 19 files changed, 306 insertions(+), 60 deletions(-) create mode 100644 .claude/commands/test-deployment.md create mode 100644 AGENTS.md create mode 100644 CLAUDE.md diff --git a/.claude/commands/create-playbook.md b/.claude/commands/create-playbook.md index f3c49f3..4c747b2 100644 --- a/.claude/commands/create-playbook.md +++ b/.claude/commands/create-playbook.md @@ -36,7 +36,7 @@ Convert the recorded interactions into a playbook YAML file. Apply these optimiz **Auth handling (IMPORTANT):** - **NEVER include login/auth steps** (typing username, password, clicking login) in the playbook. -- Authentication is handled externally by `DashboardAuth.ensure_authenticated()` before the playbook runs. It restores saved sessions or performs fresh login automatically. +- Authentication is handled externally by the auth system before the playbook runs. It detects login forms and performs fresh login automatically. - If the recording includes login interactions, **strip them out**. The playbook should start from the first post-login action. - Set `auth_required: true` if login was part of the recording — this tells the runner to authenticate before executing. 
diff --git a/.claude/commands/test-deployment.md b/.claude/commands/test-deployment.md new file mode 100644 index 0000000..f886ea3 --- /dev/null +++ b/.claude/commands/test-deployment.md @@ -0,0 +1,113 @@ +Test the deployed CUA API on Modal to verify it's working correctly. + +## Prerequisites + +Before running tests, determine the API base URL and ensure `$CUA_API_KEY` is set in the environment (loaded via direnv). The base URL depends on the Modal workspace — check the deploy output or run: + +```bash +.venv/bin/modal app list | grep cua +``` + +Set the base URL for the session: +```bash +BASE_URL="https://<workspace>--cua-serve.modal.run" +``` + +## Test cases + +Run all 4 test cases. For Tests 3 and 4, save the `run_id` from the create response and poll until `status` is `completed` (or a terminal state). + +### Test 1: Dry-run validation (config check, no sandbox) + +```bash +curl -s -X POST "$BASE_URL/runs/dry-run" \ + -H "Authorization: Bearer $CUA_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"directive": "Test directive", "max_steps": 10}' | python3 -m json.tool +``` + +**Expected**: `"valid": true`, all checks passed. + +### Test 2: Input validation (reject invalid config) + +```bash +curl -s -X POST "$BASE_URL/runs/dry-run" \ + -H "Authorization: Bearer $CUA_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"directive": "Test", "max_steps": 0}' | python3 -m json.tool +``` + +**Expected**: HTTP 422 with `"code": "INVALID_REQUEST"` and error about `max_steps`. 
+ +### Test 3: Simple directive (example.com heading) + +Create a run: +```bash +curl -s -X POST "$BASE_URL/runs" \ + -H "Authorization: Bearer $CUA_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "directive": "Go to https://example.com and tell me the heading text on the page", + "max_steps": 10, + "timeout_seconds": 120, + "start_url": "https://example.com" + }' | python3 -m json.tool +``` + +Poll until completed: +```bash +curl -s "$BASE_URL/runs/<run_id>" \ + -H "Authorization: Bearer $CUA_API_KEY" | python3 -m json.tool +``` + +**Expected**: `"status": "completed"`, result mentions "Example Domain". + +### Test 4: Structured output extraction (HN top 3) + +Create a run with `output_schema`: +```bash +curl -s -X POST "$BASE_URL/runs" \ + -H "Authorization: Bearer $CUA_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "directive": "Go to https://news.ycombinator.com and extract the titles of the top 3 stories", + "max_steps": 15, + "timeout_seconds": 180, + "start_url": "https://news.ycombinator.com", + "output_schema": { + "type": "object", + "properties": { + "stories": { + "type": "array", + "items": {"type": "object", "properties": {"rank": {"type": "integer"}, "title": {"type": "string"}}}, + "maxItems": 3 + } + }, + "required": ["stories"] + } + }' | python3 -m json.tool +``` + +Poll until completed (may take 20-30s). + +**Expected**: `"status": "completed"`, `data.stories` array with 3 items each having `rank` and `title`, no extract timeout errors in actions. + +## Evaluating results + +For each test, check: +1. **Status**: should be `completed` (not `failed` or `timeout`) +2. **Errors**: `error` field should be `null` +3. **Actions**: verify no repeated timeout errors (stuck detection should catch these) +4. **Duration**: simple directives should complete in under 30s, structured output under 60s + +## Troubleshooting sandbox logs + +If a run fails or behaves unexpectedly, check the sandbox logs in the Modal dashboard. 
+ +Look for: +- `CancelledError` in Starlette lifespan — graceful shutdown issue +- `AuthError: Token missing` — volume commit called from sandbox context +- `AsyncUsageWarning` — sync Modal API call in async context +- `browser_dom.extract failed: Timeout` — selector mismatch (check if href was truncated) + +Report a summary table of all test results with status, duration, and any errors found. diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..7e2e955 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,93 @@ +# CUA — Computer Use Agent + +Autonomous browser automation agent deployed on Modal. Accepts natural-language directives, executes them via a headless Chromium browser in a sandboxed VM, and returns structured results. + +## Quick reference + +```bash +# Run tests +.venv/bin/python -m pytest tests/ -x -q + +# Lint +.venv/bin/ruff check . + +# Type check +.venv/bin/ty check + +# Deploy to Modal +.venv/bin/modal deploy api/server.py::modal_app + +# Run agent locally (requires DISPLAY / Xvfb) +.venv/bin/python scripts/run_local.py --directive "..." --start-url "https://..." 
+``` + +## Architecture + +``` +api/ → Outer FastAPI service (Modal Function), handles /runs CRUD + server.py → FastAPI app + Modal ASGI entrypoint (modal_app lives in modal_app.py) + modal_app.py → Modal App definition, image builds, volume/dict setup + runs/ → RunService, RunRegistry (in-memory + Modal Dict), RunHandle + streaming.py → In-sandbox status API (port 8090), SSE events, status persistence + models.py → RunConfig, RunStatus, GuardrailSettings, ActionEvent + +agent/ → Agent loop (runs inside Modal Sandbox) + main.py → Sandbox entrypoint — starts status API, runs session, handles shutdown + loop.py → PydanticAI agent loop with tool definitions + session/ → SessionRunner (browser + agent lifecycle), RunFinalizer + tools.py → browser_dom tool implementation + hooks.py → PydanticAI hooks (preflight guardrails, thinking capture, error recovery) + +bridge/ → Browser abstraction layer + browser.py → BrowserManager (Patchright wrapper, page lifecycle) + execution.py → Action handlers (click, extract, goto, etc.), SequenceExecutor + page_actions.py → Primitive page actions with shared semantics + observation.py → DOM snapshots, mutations, screenshots + scripts/ → JS injected into pages (page_context.js, recorder.js) + +sandbox/ → Modal Sandbox definition + image.py → Ubuntu 24.04 image with desktop env, Patchright, agent runtime + entrypoint.sh → Starts Xvfb + openbox, runs agent/main.py + +guardrails/ → Runtime safety checks + stuck.py → Stuck detection (repetition, cycles, failure clusters, URL revisits) + scope.py → Domain allowlist/blocklist, action permissions + +blinders/ → Directive classification (goal type, login detection, action filtering) +playbooks/ → YAML-defined deterministic workflows with LLM fallback +evaluation/ → Benchmark suite runner and scoring engine +recording/ → Playwright trace capture and artifact management +telemetry/ → OpenTelemetry tracing, structured logging, metrics +``` + +## Key conventions + +- **Python 3.13+**, managed with 
`uv`. Virtual env at `.venv/`. +- **Environment**: uses `direnv` — secrets loaded from `.envrc` (not committed). +- **Settings**: all env vars centralized in `settings.py` via Pydantic Settings. Never scatter `os.environ.get()`. +- **Models**: `PRIMARY_MODEL` and `UTILITY_MODEL` constants in `settings.py`. Change there to switch everywhere. +- **Modal deploy target**: `api/server.py::modal_app` — the app variable is named `modal_app`, not `app`. +- **Sandbox vs Function**: Code in `agent/` runs inside Modal Sandboxes (no Modal API token). Code in `api/` runs in Modal Functions (has Modal auth). Don't call `modal.Volume.commit()` from sandbox code. +- **Tests**: `pytest` with `asyncio_mode = "auto"`. Integration tests marked `@pytest.mark.integration`. Run `pytest tests/ -x -q` for the full suite. +- **Lint**: `ruff` with bugbear, isort, pyupgrade, and pep8-naming rules. Line length 88. + +## API endpoints + +| Method | Path | Description | +|--------|------|-------------| +| POST | /runs/dry-run | Validate config without executing | +| POST | /runs | Create and start a new run | +| GET | /runs/{run_id} | Poll run status | +| POST | /runs/{run_id}/stop | Terminate a run | +| GET | /runs/{run_id}/stream | SSE event stream | +| GET | /runs/{run_id}/recording/manifest | List recording artifacts | +| GET | /runs/{run_id}/recording/trace | Download Playwright trace ZIP | + +Auth: `Authorization: Bearer $CUA_API_KEY` (set in Modal secret `cua-secret`). + +## Common patterns + +- **DOM snapshot truncation**: `page_context.js` truncates hrefs to 60 chars. The extract action has a fallback that retries with `href^=` (starts-with) when exact match fails. +- **Stuck detection**: sliding window over recent actions, checks for repetition, cycles, failure clusters, and URL revisits. Configurable via `GuardrailSettings`. +- **Session memory**: injected into the system prompt before each LLM request so the agent retains awareness of prior work even after context pruning. 
+- **Playbook execution**: YAML-defined step sequences with selector fallbacks, verification checks, and LLM handoff on failure. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..eef4bd2 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md \ No newline at end of file diff --git a/README.md b/README.md index 7c6ad1a..ca95a04 100644 --- a/README.md +++ b/README.md @@ -195,11 +195,10 @@ cua/ | Topic | Description | |---|---| -| [Architecture](docs/architecture.md) | Full sequence diagram and component overview | | [API Reference](docs/api.md) | Endpoints, SSE streaming, replay, multi-container support | -| [Browser Tools](docs/tools.md) | 9 browser actions, `execute_sequence` batching, design choices | +| [Browser Tools](docs/tools.md) | 10 browser actions, `execute_sequence` batching, design choices | | [Playbooks](docs/playbooks.md) | Deterministic workflows, selector fallbacks, LLM handoff | -| [Authentication](docs/authentication.md) | Session persistence, credential refs, `SecretValue`, and security caveats | +| [Authentication](docs/authentication.md) | Credential refs, `SecretValue`, and security caveats | | [Guardrails](docs/guardrails.md) | Cognitive Blinders, runtime safety, domain/action controls | | [Recording](docs/recording.md) | Playwright tracing, session replay | | [Evaluation](docs/evaluation.md) | Benchmark suites, trial scoring, pass/fail expectations | diff --git a/agent/main.py b/agent/main.py index cfcf0b9..57daf90 100644 --- a/agent/main.py +++ b/agent/main.py @@ -18,21 +18,29 @@ import os import signal import sys +from typing import TYPE_CHECKING from telemetry.logging import setup_logging +if TYPE_CHECKING: + import uvicorn + setup_logging() logger = logging.getLogger("cua.agent") STATUS_API_PORT = 8090 -async def _start_status_api() -> asyncio.Task: +async def _start_status_api() -> tuple[asyncio.Task, uvicorn.Server]: """Start the in-sandbox status API as a background asyncio task. 
Runs uvicorn in the same process so the status API shares module globals with the agent loop (push_action / complete_run update the same _status and _subscribers objects that GET /events reads from). + + Returns ``(task, server)`` so the caller can trigger a graceful + shutdown via ``server.should_exit = True`` instead of cancelling + the task (which causes a noisy ``CancelledError`` in Starlette). """ import uvicorn @@ -48,7 +56,7 @@ async def _start_status_api() -> asyncio.Task: task = asyncio.create_task(server.serve()) # Give uvicorn a moment to bind the port await asyncio.sleep(0.5) - return task + return task, server async def main() -> int: @@ -91,7 +99,7 @@ async def main() -> int: logger.info("Directive: %s", config.directive[:200]) # Start status API in-process (shares globals with agent loop) - status_task = await _start_status_api() + status_task, status_server = await _start_status_api() logger.info("Status API started on port %d (in-process)", STATUS_API_PORT) # Initialize status API state @@ -143,9 +151,10 @@ def _request_shutdown(sig: int) -> None: except asyncio.CancelledError: result = 1 - # Cancel the status API after a grace period for final SSE delivery + # Keep the status API alive so the outer API can do final polling + # during the entrypoint keep-alive window, then shut down gracefully. await asyncio.sleep(1) - status_task.cancel() + status_server.should_exit = True with contextlib.suppress(asyncio.CancelledError): await status_task diff --git a/agent/session/finalizer.py b/agent/session/finalizer.py index dc33802..e8d2fa6 100644 --- a/agent/session/finalizer.py +++ b/agent/session/finalizer.py @@ -24,7 +24,16 @@ async def _commit_recording_volume() -> None: - """Commit the recordings volume so the outer API can read persisted data.""" + """Commit the recordings volume so the outer API can read persisted data. 
+ + Inside a Modal sandbox the volume is auto-synced on exit and the Modal + API token is unavailable, so we skip the explicit commit. + """ + from settings import get_settings + + if get_settings().modal_sandbox_id != "local": + logger.debug("Skipping volume commit (sandbox auto-syncs on exit)") + return try: vol = modal.Volume.from_name(_RECORDING_VOLUME_NAME) await vol.commit.aio() diff --git a/api/runs/registry.py b/api/runs/registry.py index 2155f65..37761b3 100644 --- a/api/runs/registry.py +++ b/api/runs/registry.py @@ -50,7 +50,7 @@ def add(self, handle: RunHandle) -> None: def get(self, run_id: str) -> RunHandle | None: raise NotImplementedError - def remove(self, run_id: str) -> RunHandle | None: + async def remove(self, run_id: str) -> RunHandle | None: raise NotImplementedError def contains(self, run_id: str) -> bool: @@ -69,7 +69,7 @@ def add(self, handle: RunHandle) -> None: def get(self, run_id: str) -> RunHandle | None: return self._runs.get(run_id) - def remove(self, run_id: str) -> RunHandle | None: + async def remove(self, run_id: str) -> RunHandle | None: return self._runs.pop(run_id, None) def contains(self, run_id: str) -> bool: @@ -106,10 +106,10 @@ def get(self, run_id: str) -> RunHandle | None: # via modal.Sandbox.from_id() and re-adds to the registry. 
return self._local.get(run_id) - def remove(self, run_id: str) -> RunHandle | None: + async def remove(self, run_id: str) -> RunHandle | None: handle = self._local.pop(run_id, None) with contextlib.suppress(Exception): - self._dict.pop(run_id) + await self._dict.pop.aio(run_id) return handle def contains(self, run_id: str) -> bool: diff --git a/api/runs/service.py b/api/runs/service.py index c4b6922..b370678 100644 --- a/api/runs/service.py +++ b/api/runs/service.py @@ -234,8 +234,8 @@ def __init__( volume=volume, ) - def remove_handle(self, run_id: str) -> None: - self._registry.remove(run_id) + async def remove_handle(self, run_id: str) -> None: + await self._registry.remove(run_id) def _mark_run_active(self, run_id: str) -> None: self._active_run_ids.add(run_id) @@ -302,7 +302,7 @@ async def cleanup_finished_sandbox(self, run_id: str) -> bool: logger.info( "Cleaning up finished sandbox for run %s (exit code %s)", run_id, exit_code ) - self.remove_handle(run_id) + await self.remove_handle(run_id) self._mark_run_inactive(run_id) return True @@ -402,7 +402,7 @@ async def get_status(self, run_id: str) -> RunStatus: resp.raise_for_status() return RunStatus.model_validate(resp.json()) except (ValidationError, ValueError, TypeError) as exc: - self.remove_handle(run_id) + await self.remove_handle(run_id) self._mark_run_inactive(run_id) logger.warning( "Invalid status payload for run %s: %s", @@ -419,7 +419,7 @@ async def get_status(self, run_id: str) -> RunStatus: ), ) except httpx.HTTPError as exc: - self.remove_handle(run_id) + await self.remove_handle(run_id) self._mark_run_inactive(run_id) logger.warning("Status request failed for run %s: %s", run_id, exc) return await self._terminated_status( @@ -453,7 +453,7 @@ async def stop_run(self, run_id: str) -> dict[str, str | RunStatusValue]: "Terminate call failed for run %s: %s", run_id, exc, exc_info=True ) - self.remove_handle(run_id) + await self.remove_handle(run_id) self._mark_run_inactive(run_id) 
logger.info("Terminated run %s", run_id) return {"status": RunStatusValue.TERMINATED, "run_id": run_id} @@ -525,7 +525,7 @@ async def proxy_events(): yielded_any = True yield line + "\n" except httpx.HTTPError as exc: - self.remove_handle(run_id) + await self.remove_handle(run_id) self._mark_run_inactive(run_id) if not yielded_any: persisted = await self.build_persisted_event_stream( diff --git a/bridge/page_actions.py b/bridge/page_actions.py index 455130d..559ee4d 100644 --- a/bridge/page_actions.py +++ b/bridge/page_actions.py @@ -182,6 +182,21 @@ async def execute_page_action( raise ValueError(f"Unknown browser_dom action: {action}") +_HREF_EXACT_RE = __import__("re").compile(r'\[href="([^"]+)"\]') + + +def _relax_href_selector(selector: str) -> str | None: + """Convert exact href matches to starts-with (^=) for truncated URLs. + + The DOM snapshot truncates hrefs (e.g. 60 chars). When the LLM uses a + truncated href as an exact-match CSS selector it won't find the element. + Retrying with ``[href^="…"]`` handles this gracefully. + """ + if _HREF_EXACT_RE.search(selector): + return _HREF_EXACT_RE.sub(r'[href^="\1"]', selector) + return None + + async def extract_content( page: Any, *, @@ -204,7 +219,15 @@ async def extract_content( EXTRACT_VALUE_CALL_JS, [selector, EXTRACT_VALUE_INIT_JS], ) - return await page.inner_text(selector, timeout=timeout_ms) + try: + return await page.inner_text(selector, timeout=timeout_ms) + except Exception: + # Retry with starts-with match for truncated href selectors + relaxed = _relax_href_selector(selector) + if relaxed: + logger.debug("Retrying extract with relaxed selector: %s", relaxed) + return await page.inner_text(relaxed, timeout=timeout_ms) + raise async def _extract_markdown(page: Any) -> str: diff --git a/docs/api.md b/docs/api.md index 7f7d505..f1bd759 100644 --- a/docs/api.md +++ b/docs/api.md @@ -19,7 +19,7 @@ CUA deploys to [Modal](https://modal.com) as a managed API. 
Each run spawns an i | Field | Type | Default | Description | |---|---|---|---| | `directive` | string | (required) | Natural language task | -| `model` | string | `google-gla:gemini-3-flash-preview` | LLM model | +| `model` | string | `openai-responses:gpt-5.4` | LLM model | | `max_steps` | int | 50 | Max agent iterations | | `timeout_seconds` | int | 600 | Sandbox timeout (30-3600) | | `thinking` | string | `high` | Thinking effort level | @@ -48,11 +48,11 @@ curl -X POST https://--cua-serve.modal.run/runs/dry-run \ {"name": "profile", "passed": true, "message": "Profile 'default' loaded"}, {"name": "credentials", "passed": true, "message": "No credentials (anonymous run)"}, {"name": "guardrails", "passed": true, "message": "Default guardrails"}, - {"name": "model", "passed": true, "message": "Model: google-gla:gemini-3-flash-preview"} + {"name": "model", "passed": true, "message": "Model: openai-responses:gpt-5.4"} ], "warnings": [], "config_summary": { - "model": "google-gla:gemini-3-flash-preview", + "model": "openai-responses:gpt-5.4", "max_steps": 50, "timeout_seconds": 600, "thinking": "high", diff --git a/docs/authentication.md b/docs/authentication.md index b3d10fd..b3ba0c9 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -2,7 +2,7 @@ ## Dashboard Authentication -CUA handles dashboard login with session persistence: +CUA handles dashboard login automatically: ```bash python scripts/run_local.py \ @@ -11,12 +11,7 @@ python scripts/run_local.py \ --credentials '{"email": "admin@company.com", "password": "secret"}' ``` -The auth system: -1. Tries restoring a previously saved session (cookies/localStorage) -2. If expired, logs in by detecting common form patterns (email/username + password fields) -3. Saves the new session for future runs - -Sessions are stored at `~/.cua/sessions/` and reused across runs. +The auth system detects common login form patterns (email/username + password fields) and performs a fresh login each run. 
Credentials are resolved at fill time via `credential_ref` so secrets never appear in the LLM prompt. ## Credential Security diff --git a/docs/configuration.md b/docs/configuration.md index 828f920..87ab9e2 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -8,7 +8,7 @@ | `--playbook` | None | Playbook ID to execute deterministically | | `--playbook-params` | None | JSON dict of playbook parameters | | `--credentials` | None | JSON credentials: `'{"username": "...", "password": "..."}'` | -| `--model` | `anthropic:claude-sonnet-4-6` | Model for LLM agent (any PydanticAI-supported model) | +| `--model` | `openai-responses:gpt-5.4` | Model for LLM agent (any PydanticAI-supported model) | | `--max-steps` | 50 | Max tool-call iterations (LLM path only) | | `--thinking` | `high` | Thinking effort level (`minimal`, `low`, `medium`, `high`, `xhigh`) | | `--allow-private-networks` | false | Allow localhost and private IPs | @@ -20,8 +20,8 @@ CUA uses [PydanticAI](https://ai.pydantic.dev/) and works with any model it supports — Anthropic, OpenAI, Google Gemini, Groq, and more. 
Set the model in `settings.py`: ```python -PRIMARY_MODEL = "google-gla:gemini-3-flash-preview" # main agent -UTILITY_MODEL = "google-gla:gemini-3.1-flash-lite-preview" # classification, guardrails, extraction +PRIMARY_MODEL = "openai-responses:gpt-5.4" # main agent +UTILITY_MODEL = "openai-responses:gpt-5.4-mini" # classification, guardrails, extraction ``` To switch providers, change the model string and set the corresponding API key: diff --git a/docs/guardrails.md b/docs/guardrails.md index c1638de..5f8eae6 100644 --- a/docs/guardrails.md +++ b/docs/guardrails.md @@ -108,12 +108,15 @@ guardrails: allowed_domains: ["*.internal.com"] # Domain allowlist (optional) # Stuck detection thresholds - stuck_window_size: 8 # Sliding window of recent actions + stuck_window_size: 12 # Sliding window of recent actions stuck_repeat_hint: 3 # Same action N times → hint stuck_repeat_warn: 5 # Same action N times → warning stuck_repeat_stop: 7 # Same action N times → hard stop stuck_cycle_max_length: 3 # Max cycle pattern length (e.g. A-B-C) - stuck_cycle_repeats: 3 # Cycle must repeat N times to trigger + stuck_cycle_repeats: 3 # Cycle must repeat N times to trigger + stuck_revisit_gap: 5 # Min steps between URL revisits before warning + stuck_failure_cluster_window: 5 # Window for failure cluster detection + stuck_failure_cluster_threshold: 3 # Failed actions in window to trigger cluster alert ``` When omitted, safe defaults apply (private networks blocked, LLM checks enabled, standard limits). 
diff --git a/docs/playbooks.md b/docs/playbooks.md index 01c2752..b72202a 100644 --- a/docs/playbooks.md +++ b/docs/playbooks.md @@ -53,7 +53,7 @@ Playbooks support these actions (same as the LLM agent's `browser_dom` tool): - `llm_recover` (default) — after 2 failures, hands off ALL remaining steps to the full LLM agent - `retry` — retry without LLM fallback - `abort` — stop immediately -- **Authentication**: Built-in login flow with session persistence via Playwright cookies/localStorage. Sessions saved to `~/.cua/sessions/` and reused across runs. +- **Authentication**: Built-in login flow that detects common form patterns (email/username + password fields) and performs fresh login when `auth_required: true`. ## Execution Tiers diff --git a/docs/tools.md b/docs/tools.md index 026a232..516d3d8 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -1,6 +1,6 @@ # Browser Tools -CUA exposes a single `browser_dom` tool with 9 actions. The agent chooses which action to call based on the task and page state. +CUA exposes a single `browser_dom` tool with 10 actions. The agent chooses which action to call based on the task and page state. ## Actions @@ -13,7 +13,8 @@ CUA exposes a single `browser_dom` tool with 9 actions. The agent chooses which | `scroll(direction, amount)` | Scroll the page | Page map | | `extract(selector, mode)` | Extract content as markdown (default), text, HTML, or form values | Content string + page map | | `get_dom(selector?)` | Get a compact DOM snapshot (optionally scoped) | DOM string | -| `wait_for(selector, state)` | Wait for an element to be visible, hidden, etc. 
| Confirmation | +| `select(selector, value)` | Select a dropdown option | Confirmation | +| `evaluate(script)` | Execute arbitrary JavaScript on the page | Page map (if URL changed) | | `execute_sequence(steps)` | **Batch multiple actions in a single tool call** | Combined results + page map | ## Why `execute_sequence` matters diff --git a/tests/test_run_registry.py b/tests/test_run_registry.py index 946757e..f909594 100644 --- a/tests/test_run_registry.py +++ b/tests/test_run_registry.py @@ -118,58 +118,58 @@ def test_get_does_not_query_remote_on_cache_miss(self): class TestModalDictRunRegistryRemove: - def test_remove_deletes_handle_from_local_cache(self): + async def test_remove_deletes_handle_from_local_cache(self): registry, _ = _make_registry() handle = _make_handle("run-1") registry._local["run-1"] = handle - registry.remove("run-1") + await registry.remove("run-1") assert "run-1" not in registry._local - def test_remove_calls_dict_pop(self): + async def test_remove_calls_dict_pop_aio(self): registry, modal_dict = _make_registry() handle = _make_handle("run-1") registry._local["run-1"] = handle - registry.remove("run-1") + await registry.remove("run-1") - modal_dict.pop.assert_called_once_with("run-1") + modal_dict.pop.aio.assert_called_once_with("run-1") - def test_remove_returns_the_handle(self): + async def test_remove_returns_the_handle(self): registry, _ = _make_registry() handle = _make_handle("run-1") registry._local["run-1"] = handle - result = registry.remove("run-1") + result = await registry.remove("run-1") assert result is handle - def test_remove_returns_none_when_run_id_not_in_local(self): + async def test_remove_returns_none_when_run_id_not_in_local(self): registry, _ = _make_registry() - result = registry.remove("nonexistent-run") + result = await registry.remove("nonexistent-run") assert result is None - def test_remove_returns_handle_when_dict_pop_raises(self): + async def test_remove_returns_handle_when_dict_pop_raises(self): registry, 
modal_dict = _make_registry() - modal_dict.pop.side_effect = RuntimeError("modal unavailable") + modal_dict.pop.aio.side_effect = RuntimeError("modal unavailable") handle = _make_handle("run-1") registry._local["run-1"] = handle - result = registry.remove("run-1") + result = await registry.remove("run-1") assert result is handle - def test_remove_does_not_raise_when_dict_pop_raises(self): + async def test_remove_does_not_raise_when_dict_pop_raises(self): registry, modal_dict = _make_registry() - modal_dict.pop.side_effect = Exception("network error") + modal_dict.pop.aio.side_effect = Exception("network error") handle = _make_handle("run-1") registry._local["run-1"] = handle # Failure must be swallowed - registry.remove("run-1") + await registry.remove("run-1") class TestModalDictRunRegistryContains: diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 2cae6e5..c449554 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -73,18 +73,18 @@ def test_phase_transition_to_terminated(self): handle.phase = RunPhase.TERMINATED assert handle.phase == RunPhase.TERMINATED - def test_registry_add_get_remove(self): + async def test_registry_add_get_remove(self): reg = InMemoryRunRegistry() handle = RunHandle(run_id="r1", sandbox=None, status_base_url="http://x") reg.add(handle) assert reg.get("r1") is handle - removed = reg.remove("r1") + removed = await reg.remove("r1") assert removed is handle assert reg.get("r1") is None - def test_registry_remove_nonexistent(self): + async def test_registry_remove_nonexistent(self): reg = InMemoryRunRegistry() - assert reg.remove("nonexistent") is None + assert await reg.remove("nonexistent") is None def test_error_field(self): handle = RunHandle( diff --git a/uv.lock b/uv.lock index aea0595..7b12961 100644 --- a/uv.lock +++ b/uv.lock @@ -454,7 +454,7 @@ wheels = [ [[package]] name = "cua" -version = "0.2.0" +version = "0.4.0" source = { virtual = "." } dependencies = [ { name = "anthropic" },