diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 3a690e38..7c99216c 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -741,7 +741,7 @@ pub mod definitions { pub fn update_goal() -> ToolDefinition { ToolDefinition { name: "update_goal".to_string(), - description: "Update the existing goal.\nUse this tool only to mark the goal achieved or genuinely blocked.\nSet status to `complete` only when the objective has actually been achieved and no required work remains.\nSet status to `blocked` only when the same blocking condition has repeated for at least three consecutive goal turns, counting the original/user-triggered turn and any automatic continuations, and the agent cannot make meaningful progress without user input or an external-state change.\nIf the user resumes a goal that was previously marked `blocked`, treat the resumed run as a fresh blocked audit. If the same blocking condition then repeats for at least three consecutive resumed goal turns, set status to `blocked` again.\nOnce the blocked threshold is satisfied, do not keep reporting that you are still blocked while leaving the goal active; set status to `blocked`.\nDo not use `blocked` merely because the work is hard, slow, uncertain, incomplete, or would benefit from clarification.\nDo not mark a goal complete merely because its budget is nearly exhausted or because you are stopping work.\nYou cannot use this tool to pause, resume, budget-limit, or usage-limit a goal; those status changes are controlled by the user or system.\nWhen marking a budgeted goal achieved with status `complete`, report the final token usage from the tool result to the user.".to_string(), + description: "Update the existing goal. Set status to `complete` only when the objective has actually been achieved and no required work remains; set status to `blocked` only when the same blocking condition has repeated for at least three consecutive goal turns (counting the original/user-triggered turn and any automatic continuations, and restarting a fresh audit when a previously blocked goal is resumed) and the agent cannot progress without user input or an external-state change.\nDo not use `blocked` merely because the work is hard, slow, uncertain, incomplete, or would benefit from clarification; do not mark complete merely because the budget is nearly exhausted or you are stopping; and do not use this tool to pause, resume, budget-limit, or usage-limit a goal (those are controlled by the user or system). When marking a budgeted goal `complete`, report the final token usage from the tool result to the user.".to_string(), input_schema: json!({ "type": "object", "properties": { diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index c0ee7c37..c9b59ccf 100644 --- a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -7,6 +7,7 @@ import base64 import gzip +import ipaddress import json import math import os @@ -1694,16 +1695,114 @@ def json(self): raise ValueError(f"request failed for {self.url}: {self.error}") -def http_get(url, headers=None, timeout=20.0, binary=None): +def _is_private_or_local_host(host): + """True for hosts the fetch proxy must never see: loopback, RFC1918/link-local + ranges, .local/.internal-style suffixes, and dotless intranet shortnames. + Routing these through the remote proxy would leak the URL/headers off-box and + fetch the WRONG target (the proxy's localhost, not the caller's).""" + host = str(host or "").strip().lower().rstrip(".").strip("[]") + if not host: + return True + if host == "localhost" or host.endswith(".localhost"): + return True + if host.endswith((".local", ".internal", ".lan", ".intranet", ".corp", ".home.arpa")): + return True + try: + ip = ipaddress.ip_address(host) + except ValueError: + return "." not in host + return ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_unspecified + + +class _ProxyFetchResponse: + """Response shim mirroring fetch-use's FetchResponse attribute surface.""" + + __slots__ = ("status_code", "status", "headers", "url", "text", "content") + + def __init__(self, status_code, headers, url, body, body_b64, is_binary): + self.status_code = status_code + self.status = status_code + self.headers = headers or {} + self.url = url + self.text = body or "" + if is_binary and body_b64: + self.content = base64.b64decode(body_b64) + else: + self.content = (body or "").encode("utf-8", errors="replace") + + +def _fetch_use_sync(url, headers=None, timeout_ms=30000, method="GET", body=None): + """Vendored minimal Browser-Use Fetch client (mirrors the `fetch-use` pkg). + + POSTs through fetch.browser-use.com so requests carry Chrome TLS + fingerprinting + rotating proxy IPs — the same un-blockable path browser-use + uses — instead of a bare urllib request that bot-protection blocks. Vendored + so it works even when the `fetch_use` package isn't installed in the sandbox. + """ + import uuid as _uuid + + api_key = os.environ.get("BROWSER_USE_API_KEY", "") + if not api_key: + raise RuntimeError("BROWSER_USE_API_KEY not set") + service = (os.environ.get("FETCH_USE_URL") or "https://fetch.browser-use.com").rstrip("/") + session_id = (os.environ.get("SESSION_ID") or str(_uuid.uuid4()))[:36] + payload = { + "url": url, + "method": str(method or "GET").upper(), + "timeout_ms": min(int(timeout_ms), 120000), + "follow_redirects": True, + "max_redirects": 10, + "proxy_country": os.environ.get("FETCH_USE_PROXY_COUNTRY", "US"), + "session_id": session_id, + } + if headers: + payload["headers"] = dict(headers) + if body is not None: + payload["body"] = body + req_headers = {"Content-Type": "application/json", "X-Browser-Use-API-Key": api_key} + token = os.environ.get("FETCH_USE_AUTH_TOKEN") + if token: + req_headers["X-Fetch-Token"] = token + data = json.dumps(payload).encode("utf-8") + request = urllib.request.Request(service + "/fetch", data=data, headers=req_headers, method="POST") + with urllib.request.urlopen(request, timeout=(int(timeout_ms) / 1000) + 10) as resp: + result = json.loads(resp.read().decode("utf-8")) + if result.get("error"): + raise RuntimeError(f"fetch proxy error: {result['error']}") + return _ProxyFetchResponse( + result.get("status_code", 0), + result.get("headers", {}), + result.get("final_url", url), + result.get("body", ""), + result.get("body_base64", ""), + result.get("is_binary", False), + ) + + +def http_get(url, headers=None, timeout=20.0, binary=None, use_proxy=None): """Pure HTTP fetch for static pages and APIs. - When BROWSER_USE_API_KEY is set and fetch_use is installed, route through - fetch-use like browser-harness. Otherwise fall back to local urllib with a - browser-like UA and gzip handling. Pass binary=True for bytes. + Public URLs route through the Browser-Use Fetch proxy (Chrome TLS + fingerprint + rotating IPs) when BROWSER_USE_API_KEY is set, so + bot-protected sites don't block us — preferring the installed `fetch_use` + package, else the vendored client above. Loopback/private/intranet hosts + are ALWAYS fetched directly (never sent to the proxy). On proxy failure the + request falls back to direct urllib and the proxy error is surfaced. + Pass binary=True for bytes. use_proxy: None=auto (public hosts only), + True=force the proxy, False=force direct. """ - if os.environ.get("BROWSER_USE_API_KEY"): + proxy_error = None + want_proxy = ( + use_proxy + if use_proxy is not None + else not _is_private_or_local_host(urlparse(url).hostname) + ) + if want_proxy and os.environ.get("BROWSER_USE_API_KEY"): try: - from fetch_use import fetch_sync + try: + from fetch_use import fetch_sync + except ImportError: + fetch_sync = _fetch_use_sync response = fetch_sync(url, headers=headers, timeout_ms=int(float(timeout) * 1000)) status_code = getattr(response, "status_code", getattr(response, "status", None)) @@ -1726,8 +1825,16 @@ def http_get(url, headers=None, timeout=20.0, binary=None): response_headers, response_url, ) - except ImportError: - pass + except Exception as exc: + # Proxy unavailable / auth / schema / network error — fall back to a + # direct urllib request below, but keep the proxy error visible so a + # bot-blocked direct response isn't mistaken for proxy success. + proxy_error = exc + print( + f"http_get: fetch proxy failed ({exc}); retrying direct", + file=sys.stderr, + flush=True, + ) request_headers = {"User-Agent": "Mozilla/5.0", "Accept-Encoding": "gzip"} if headers: request_headers.update(headers) @@ -1751,11 +1858,16 @@ def http_get(url, headers=None, timeout=20.0, binary=None): f"{exc.code} for {url}. If this is bot/login protection, retry from the browser with js(fetch(...)), " "pass site-specific headers/cookies, or configure the Browser Use fetch proxy with BROWSER_USE_API_KEY." ) + if proxy_error is not None: + guidance += f" (fetch proxy also failed: {proxy_error})" raise RuntimeError(guidance) from exc except (urllib.error.URLError, TimeoutError, OSError) as exc: - raise RuntimeError( + message = ( f"http_get failed for {url}: {exc}. Try a shorter timeout, browser js(fetch(...)), or a configured proxy if the site blocks direct HTTP." - ) from exc + ) + if proxy_error is not None: + message += f" (fetch proxy also failed: {proxy_error})" + raise RuntimeError(message) from exc def http_get_many(urls, headers=None, timeout=20.0, binary=None, max_workers=8, return_errors=True): diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 0e4f30e5..309c860d 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -28,8 +28,20 @@ use tungstenite::{connect, Message, WebSocket}; const BU_API: &str = "https://api.browser-use.com/api/v3"; const LOG_LIMIT: usize = 250; const SCRIPT_MAX_OUTPUT_CHARS: usize = 120_000; -const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 15_000; -const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 1_000; +// Cost optimization (eval-everything): a script that finishes within the start +// call returns its result in ONE tool call — no separate `observe` model turns. +// Raised 15s->30s so the common scrape script (which finishes well under 30s) +// no longer forces a poll round-trip. This is a single, non-stacking block that +// still hands control back at 30s, so a stuck script can be cancelled/finalized +// (unlike the reverted "observe30", which STACKED 30s observe blocks and starved +// the run timebox — see DEFAULT_OBSERVE_TIMEOUT_MS doc in browser.rs). +const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 30_000; +// The `next_observe_ms` HINT surfaced to the model ("call observe with +// observe_timeout_ms=N"). Raised 1s->15s to nudge the model to long-poll instead +// of issuing 1s "still running?" peeks (the dominant observe-churn cost). This is +// only a hint — the observe floor stays at 1s, so the model keeps full agency to +// bail early; we stay under the 30s window that previously regressed. +const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 15_000; const BROWSER_SCRIPT_HELPERS: &str = include_str!("browser_script_helpers.py"); const BROWSER_CONNECT_LOCAL_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(120); const BROWSER_CONNECT_ATTACH_DEADLINE: Duration = Duration::from_secs(8); @@ -13091,6 +13103,127 @@ print("http_get_many parity ok") assert!(output.text.contains("http_get_many parity ok")); } + #[test] + fn browser_script_http_get_vendored_proxy_private_bypass_and_error_fallback() { + let temp = tempfile::tempdir().unwrap(); + let output = run_browser_script( + "script-http-get-vendored-proxy", + temp.path(), + temp.path().join("artifacts"), + r#" +import http.server +import json +import os +import socketserver +import sys +import threading + +assert _is_private_or_local_host("localhost") +assert _is_private_or_local_host("127.0.0.1") +assert _is_private_or_local_host("10.1.2.3") +assert _is_private_or_local_host("192.168.0.5") +assert _is_private_or_local_host("169.254.1.1") +assert _is_private_or_local_host("printer.local") +assert _is_private_or_local_host("wiki.internal") +assert _is_private_or_local_host("intranet-host") +assert not _is_private_or_local_host("example.com") +assert not _is_private_or_local_host("8.8.8.8") + +proxy_calls = [] +proxy_mode = {"fail": False} + +class FakeFetchProxy(http.server.BaseHTTPRequestHandler): + def log_message(self, fmt, *args): + pass + + def do_POST(self): + assert self.path == "/fetch" + assert self.headers.get("X-Browser-Use-API-Key") == "test-key" + req = json.loads(self.rfile.read(int(self.headers["Content-Length"]))) + proxy_calls.append(req["url"]) + if proxy_mode["fail"]: + self.send_response(500) + self.end_headers() + return + body = json.dumps({ + "status_code": 200, + "status": "200 OK", + "headers": {"x-proxy": "yes"}, + "body": "proxied:" + req["url"], + "body_base64": "", + "is_binary": False, + "final_url": req["url"], + "redirect_count": 0, + "protocol": "HTTP/2.0", + }).encode() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + +class DirectTarget(http.server.BaseHTTPRequestHandler): + def log_message(self, fmt, *args): + pass + + def do_GET(self): + body = b"direct" + self.send_response(200) + self.send_header("Content-Type", "text/plain; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + +proxy_server = socketserver.TCPServer(("127.0.0.1", 0), FakeFetchProxy) +target_server = socketserver.TCPServer(("127.0.0.1", 0), DirectTarget) +for server in (proxy_server, target_server): + threading.Thread(target=server.serve_forever, daemon=True).start() +target_base = f"http://127.0.0.1:{target_server.server_address[1]}" + +sys.modules.pop("fetch_use", None) # force the VENDORED client path +os.environ["BROWSER_USE_API_KEY"] = "test-key" +os.environ["FETCH_USE_URL"] = f"http://127.0.0.1:{proxy_server.server_address[1]}" + +try: + # 1) public URL goes through the vendored proxy client + proxied = http_get("https://public.example/data") + assert proxied == "proxied:https://public.example/data", proxied + assert proxied.status_code == 200 and proxied.headers["x-proxy"] == "yes" + + # 2) loopback/private host bypasses the proxy entirely + before = len(proxy_calls) + direct = http_get(target_base + "/anything") + assert direct == "direct", direct + assert len(proxy_calls) == before, "private host must never reach the proxy" + + # 3) use_proxy=True forces even a private host through the proxy + forced = http_get(target_base + "/anything", use_proxy=True) + assert forced == "proxied:" + target_base + "/anything", forced + + # 4) proxy failure falls back to direct; both errors surfaced when direct also fails + proxy_mode["fail"] = True + fallback = http_get(target_base + "/anything", use_proxy=True, timeout=3) + assert fallback == "direct", fallback + try: + http_get("https://no-such-host.invalid/x", timeout=3) + except RuntimeError as exc: + assert "fetch proxy also failed" in str(exc), exc + else: + raise AssertionError("expected both proxy and direct to fail") +finally: + for server in (proxy_server, target_server): + server.shutdown() + server.server_close() +print("http_get vendored proxy ok") +"#, + 20, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + assert!(output.text.contains("http_get vendored proxy ok")); + } + #[test] fn browser_script_browser_fetch_single_returns_structured_errors_by_default() { let temp = tempfile::tempdir().unwrap(); diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index c4a354e9..0fcf0e10 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -6,17 +6,12 @@ Use the `browser` tool for connection/runtime work first. If the browser is not Important execution model: -- Each `browser_script` call starts a fresh Python process. -- Python variables do not persist across calls. -- Browser/CDP state persists in Rust. +- Each `browser_script` call starts a fresh Python process; Python variables do not persist across calls. Browser/CDP state persists in Rust. - Fast calls return their final result immediately. Long calls return `status: running` with a `run_id`; keep observing that same run until it finishes, fails, or is cancelled. -- To listen to a running script, call this tool with `action="observe"`, the returned `run_id`, and optionally `observe_timeout_ms`. Prefer coarse waits such as 30000-120000 ms for long navigation or extraction scripts; do not burn many turns polling the same `run_id` with short waits. -- To stop a running script, call this tool with `action="cancel"` and the `run_id`. Partial images and artifacts emitted before cancellation are preserved. -- A failed `browser_script` call may include a short diagnosis. Read that diagnosis first: if it says the browser is still connected or the same page is usable, continue from the same page instead of reconnecting. -- Helpers are preimported; you do not need imports for normal browser work. -- CDP is the source of truth. If a helper is incomplete, use `cdp(...)` directly. -- Keep browser actions sequential and deliberate. -- Do not import Playwright, Selenium, or Pyppeteer. +- To listen to a running script, call this tool with `action="observe"`, the `run_id`, and optionally `observe_timeout_ms`. Prefer coarse waits (30000-120000 ms) for long navigation/extraction; do not burn many turns polling with short waits. To stop a run, call `action="cancel"` with the `run_id`; partial images/artifacts emitted before cancellation are preserved. +- A failed call may include a short diagnosis. Read it first: if it says the browser is still connected or the same page is usable, continue from the same page instead of reconnecting. +- Helpers are preimported; no imports needed for normal browser work. CDP is the source of truth — if a helper is incomplete, use `cdp(...)` directly. +- Keep browser actions sequential and deliberate. Do not import Playwright, Selenium, or Pyppeteer. Preimported helpers: @@ -73,21 +68,16 @@ last_domain_skills(include_content=False) Usage guidance: -- First navigation should usually be `new_tab(url)`, not `goto_url(url)`, because `goto_url(url)` mutates the current controlled tab. Both helpers send the CDP navigation command, perform a bounded readiness check, and emit a labeled `navigation` output with `status`, `page_info`, `page_state`, and `next_step`. If that output says `navigation_ready` and `page_info.url` is the expected page, trust it and inspect/extract from the current page instead of navigating to the same URL again. If you chain more work in the same script after navigation, explicitly wait or poll for the specific selector/state you need before reading/clicking. -- If a navigation is blocked by the user's `/domains` policy (the error says so), call `nav_policy()` to see the allowed/denied sites and plan within them; pass a URL (`nav_policy("example.com")`) to check before navigating. If the task can't be completed within the policy, tell the user which site is blocked and suggest they allow it with `/domains` or adjust the task — don't keep retrying the blocked host. -- Keep keyboard semantics browser-harness/Rod aligned: `press_key(...)` simulates physical keys or shortcuts, while `type_text(...)` inserts/pastes text into the focused element with `Input.insertText`. -- For React/Vue/Svelte/controlled inputs, prefer `fill_input(selector, text, timeout=...)` over direct DOM value assignment. It focuses the element, clears with Cmd/Ctrl+A plus Backspace, types through physical key events, then fires final `input`/`change` events. Use stable selectors from labels, ids, names, placeholders, or visible DOM inspection; avoid brittle positional selectors such as `input:nth-of-type(2)` unless you just verified that exact selector on the current page. -- Do not combine `Input.dispatchKeyEvent` carrying printable `text` with a manual `char` event for the same character; that double-inserts text in Chrome. +- First navigation should usually be `new_tab(url)`, not `goto_url(url)`, because `goto_url(url)` mutates the current controlled tab. Both send the CDP navigation command, perform a bounded readiness check, and emit a labeled `navigation` output with `status`, `page_info`, `page_state`, and `next_step`. If that output says `navigation_ready` and `page_info.url` is the expected page, trust it and inspect/extract instead of navigating again. If you chain more work after navigation, explicitly wait or poll for the specific selector/state before reading/clicking. +- If a navigation is blocked by the user's `/domains` policy (the error says so), call `nav_policy()` to see allowed/denied sites and plan within them; pass a URL (`nav_policy("example.com")`) to check before navigating. If the task can't be done within the policy, tell the user which site is blocked and suggest `/domains` or adjusting the task — don't keep retrying the blocked host. +- Keyboard semantics: `press_key(...)` simulates physical keys/shortcuts; `type_text(...)` inserts/pastes text into the focused element via `Input.insertText`. Do not combine `Input.dispatchKeyEvent` carrying printable `text` with a manual `char` event for the same character; that double-inserts in Chrome. +- For React/Vue/Svelte/controlled inputs, prefer `fill_input(selector, text, timeout=...)` over direct DOM value assignment. It focuses, clears with Cmd/Ctrl+A plus Backspace, types through physical key events, then fires final `input`/`change` events. Use stable selectors from labels, ids, names, placeholders, or visible DOM inspection; avoid brittle positional selectors like `input:nth-of-type(2)` unless you just verified that exact selector on the current page. - If the task is site-specific, call `domain_skills_for_url(url, include_content=True)` before inventing selectors, private API routes, or flows. `goto_url(url)` also returns matching `domain_skills` metadata when a skill root is available. -- Be patient with loading pages by making several cheap observations, not one long blind wait. Prefer short waits such as `wait_for_load(1)`, `wait_for_element(selector, timeout=2)`, or `wait_for_network_idle(2)`, then inspect again. If a wait returns false, that is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. -- Use screenshots as labeled temporal checkpoints: initial load, before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and final verification. For screenshot or visual-output tasks, verify the saved image is contentful and nonblank before `done`. -- The common screenshot call is `screenshot(label)`, for example `screenshot("before_submit")`. -- Screenshot/image artifacts are sent as `input_image` content to the next model turn. The user does not see those pixels inline in the terminal; describe what you see or provide the saved artifact path when the user asks for the screenshot. -- If a script emits screenshots/images and then fails, the next model turn still receives the images alongside the failure diagnosis. Use those pixels to decide the next smaller retry. -- If a running script emits screenshots/images before it finishes, `observe` returns those images as soon as they are available. Use the pixels to guide the next observe/retry. -- Use `emit_output(value, label="...")` for structured observations that the next model turn may need, such as `page_info()`, extracted rows, selected DOM state, or API responses. The full value stays model-visible. -- When a script emits labeled structured output, add a `# browser_summary:` JSON comment block at the top of the script that maps each emitted label to the compact transcript summary. Write the code/labels first mentally, then place or update this block before submitting the tool call; the runtime parses the whole script before execution. -- Summary values may be literals, JSONPath-like selectors such as `$.url`, or template strings such as `Read ${$.length} employee rows`. Missing summary specs fall back to a generic `Recorded