From 2e788925e4311149cf330a73b964fe7db5373983 Mon Sep 17 00:00:00 2001
From: reformedot
Date: Thu, 4 Jun 2026 18:07:22 -0700
Subject: [PATCH 01/10] Add locally-executed DuckDuckGo `search` tool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port the Python `search` action (DuckDuckGo Lite HTTP search) into the async
agent engine as a new locally-dispatched `search` tool. Only the search logic
is carried over — the `request_human_control` action and the
Controller/DB/session scaffolding are dropped per "keep the logic only".

Unlike the existing hosted `web_search` (provider-executed, no local I/O), this
tool performs a real HTTP GET against `lite.duckduckgo.com/lite/` and parses
the result HTML itself, so it works against any provider.

Implementation notes:
- New handler `tools/handlers/search.rs` follows the same trait stack
  (Approvable + Sandboxable + ToolRuntime) as the sibling tools, with the HTTP
  fetch behind a `SearchBackend` seam (real reqwest impl + fake for tests),
  mirroring the browser/python/mcp backend-injection pattern.
- No new dependencies: the repo deliberately avoids HTML-parser deps (browser
  DOM comes from CDP), so parsing uses targeted `regex` over the fixed
  DuckDuckGo Lite markup plus a small hand-rolled percent-decoder and entity
  decoder. Faithful to the original BeautifulSoup logic.
- Registered as `search` in both `default_registry` and the production
  dispatcher (`build_tool_dispatcher_with_cwd_and_goal_store`) so the live
  model can actually call it; parallel-safe (read-only).
- Tests are fully deterministic (fixture HTML + fake backend, no network):
  parsing, URL unwrapping, entity/whitespace handling, response
  classification, formatting, and orchestrator/registry/dispatcher wiring.

Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/entrypoint/provider.rs | 17 +- .../src/tools/handlers/mod.rs | 8 + .../src/tools/handlers/search.rs | 736 ++++++++++++++++++ .../src/tools/handlers/search_tests.rs | 476 +++++++++++ .../browser-use-agent/src/tools/registry.rs | 40 +- .../src/tools/registry_tests.rs | 60 +- 6 files changed, 1330 insertions(+), 7 deletions(-) create mode 100644 crates/browser-use-agent/src/tools/handlers/search.rs create mode 100644 crates/browser-use-agent/src/tools/handlers/search_tests.rs diff --git a/crates/browser-use-agent/src/entrypoint/provider.rs b/crates/browser-use-agent/src/entrypoint/provider.rs index 620c7f30..d857c9d0 100644 --- a/crates/browser-use-agent/src/entrypoint/provider.rs +++ b/crates/browser-use-agent/src/entrypoint/provider.rs @@ -1111,8 +1111,9 @@ fn resolve_provider_with_python( /// The registry registers the backend-free handlers — `shell`, `apply_patch`, /// `view_image`, `update_plan`, `done`, `tool_search` (catalog populated from the registered tools' defs), /// `web_search` (ENABLED; the Responses builder encodes it as the hosted -/// `web_search_preview` tool) — plus the two product-surface tools that drive -/// real subsystems: +/// `web_search_preview` tool), `search` (a locally-executed DuckDuckGo search, +/// distinct from the hosted `web_search`) — plus the two product-surface tools +/// that drive real subsystems: /// * `browser` ([`BrowserTool::new`]): standalone — the production /// [`RealBackend`](crate::tools::handlers::browser::RealBackend) wraps the /// `browser-use-browser` crate and manages CDP sessions internally (keyed by @@ -1208,6 +1209,7 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( use crate::tools::handlers::done::{DoneRequest, DoneTool}; use crate::tools::handlers::mcp::McpToolCallRequest; use crate::tools::handlers::python::{PythonRequest, PythonTool}; + use crate::tools::handlers::search::{SearchRequest, SearchTool}; use crate::tools::handlers::shell::{ 
ExecCommandRequest, ExecCommandTool, ShellRequest, ShellTool, WriteStdinRequest, WriteStdinTool, @@ -1292,6 +1294,10 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( true, WebSearchTool::new(WebSearchConfig::enabled()), ); + // `search`: locally-executed DuckDuckGo (Lite) web search — the client runs + // the HTTP request and parses the results itself (distinct from the hosted + // `web_search` above). Read-only, so parallel_safe = true. + reg.register::<_, SearchRequest>("search", definitions::search(), true, SearchTool::new()); let browser_backend = browser_backend_for_runtime_or_config( config, runtime_handle.as_ref(), @@ -3188,6 +3194,13 @@ mod tests { assert!(names.contains(&"browser")); assert!(names.contains(&"done")); assert!(names.contains(&"update_plan")); + // Both web searches are wired into the production dispatcher: the hosted + // `web_search` and the locally-executed DuckDuckGo `search`. + assert!(names.contains(&"web_search")); + assert!( + names.contains(&"search"), + "the locally-executed `search` tool must be reachable by the live model" + ); } /// A non-empty `mcp_servers` map registers the `mcp` tool. 
The stdio server diff --git a/crates/browser-use-agent/src/tools/handlers/mod.rs b/crates/browser-use-agent/src/tools/handlers/mod.rs index 89b5cfee..cfae823e 100644 --- a/crates/browser-use-agent/src/tools/handlers/mod.rs +++ b/crates/browser-use-agent/src/tools/handlers/mod.rs @@ -13,6 +13,7 @@ pub mod done; pub mod goal; pub mod mcp; pub mod python; +pub mod search; pub mod shell; pub mod subagent; pub mod tool_search; @@ -31,6 +32,8 @@ mod mcp_tests; #[cfg(test)] mod python_tests; #[cfg(test)] +mod search_tests; +#[cfg(test)] mod shell_tests; #[cfg(test)] mod tool_search_tests; @@ -52,6 +55,11 @@ pub use mcp::{ MCP_ERROR_EXIT_CODE, MCP_EVENT_RESULT_MAX_CHARS, }; pub use python::{PythonApprovalKey, PythonBackend, PythonRequest, PythonTool}; +pub use search::{ + classify_response, extract_real_url, format_results, normalize_whitespace, parse_lite_results, + HttpSearchBackend, SearchApprovalKey, SearchBackend, SearchError, SearchRequest, SearchResult, + SearchTool, +}; pub use shell::{ ExecCommandApprovalKey, ExecCommandRequest, ExecCommandTool, ShellApprovalKey, ShellRequest, ShellTool, WriteStdinApprovalKey, WriteStdinRequest, WriteStdinTool, diff --git a/crates/browser-use-agent/src/tools/handlers/search.rs b/crates/browser-use-agent/src/tools/handlers/search.rs new file mode 100644 index 00000000..d21aeeaa --- /dev/null +++ b/crates/browser-use-agent/src/tools/handlers/search.rs @@ -0,0 +1,736 @@ +//! `search` tool: a LOCALLY-executed DuckDuckGo (Lite) web search. +//! +//! This is the async re-implementation of the legacy Python `search` action +//! (a `browser_use` `Controller` action that fetched +//! `lite.duckduckgo.com/lite/` over HTTP and parsed the result HTML). Only the +//! *search logic* is ported — the surrounding `Controller` / DB / session +//! scaffolding (and the unrelated `request_human_control` action) are dropped. +//! Like the other handlers it implements the full trait stack +//! 
([`Approvable`] + [`Sandboxable`] + [`ToolRuntime`]) so it can be driven by +//! the [`ToolOrchestrator`](crate::tools::orchestrator::ToolOrchestrator), +//! mirroring the `tool_search` tool's structure: a non-FS, +//! fetch-parse-and-return tool that spawns no process. +//! +//! # Relationship to [`web_search`](super::web_search) +//! +//! [`web_search`](super::web_search) is the HOSTED, provider-executed web search +//! (the provider runs the search server-side; the client only declares + passes +//! through the result — it performs *no* local HTTP). This `search` tool is the +//! opposite: it performs a REAL local HTTP GET against DuckDuckGo Lite and parses +//! the returned HTML itself, exactly as the Python action did. The two are +//! complementary, not duplicates: `web_search` needs a capable provider; `search` +//! works against any provider because the client does the work. +//! +//! # Network seam (testability) +//! +//! The HTTP fetch lives behind the [`SearchBackend`] trait, with the real +//! [`HttpSearchBackend`] (a `reqwest` client) injected by default and a fake +//! substitutable in tests. This mirrors how the `browser` / `python` / `mcp` +//! handlers inject their backends (`BrowserTool::with_backend`, +//! `McpTool::new(Arc)`), so the tool's parsing/formatting logic is +//! unit-tested deterministically with fixture HTML — no network is touched. +//! +//! # HTML parsing +//! +//! The Python original used BeautifulSoup. This crate intentionally carries no +//! HTML-parser dependency (the existing browser tooling reads the DOM from a real +//! browser over CDP, never by parsing HTML strings), so to keep the dependency +//! footprint unchanged we extract the few fields we need with targeted `regex` +//! over the *specific, stable* DuckDuckGo Lite markup — the same fixed selectors +//! BeautifulSoup keyed on (`a.result-link`, `td.result-snippet`). The extraction +//! is faithful to the Python logic and fully fixture-tested in `search_tests.rs`. 
+ +use std::sync::{Arc, OnceLock}; +use std::time::Duration; + +use regex::Regex; +use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, USER_AGENT}; + +use crate::tools::runtime::{ + Approvable, ExecOutput, SandboxAttempt, Sandboxable, ToolCtx, ToolError, ToolRuntime, +}; +use crate::tools::sandbox::{SandboxPermissions, SandboxPreference}; + +/// The tool name surfaced to the model. +pub const SEARCH_TOOL_NAME: &str = "search"; + +/// The DuckDuckGo Lite search endpoint the real backend fetches. +const DDG_LITE_BASE_URL: &str = "https://lite.duckduckgo.com/lite/"; + +/// Browser-like `User-Agent` (ported verbatim from the Python action's headers). +const DDG_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \ +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36"; + +/// `Accept` header (ported verbatim from the Python action's headers). +const DDG_ACCEPT: &str = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; + +/// `Accept-Language` header (ported verbatim from the Python action's headers). +const DDG_ACCEPT_LANGUAGE: &str = "en-US,en;q=0.9"; + +/// Request timeout (the Python action used `timeout=30.0`). +const SEARCH_REQUEST_TIMEOUT_SECS: u64 = 30; + +/// A single parsed search result. +/// +/// Mirrors the Python action's `{title, url, description}` dict. +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct SearchResult { + /// The result's title (the `a.result-link` text). + pub title: String, + /// The result's destination URL (the DuckDuckGo redirect, unwrapped). + pub url: String, + /// The result's snippet (the following `td.result-snippet` text), if any. + pub description: String, +} + +/// Typed request for the `search` tool. +/// +/// Mirrors the Python `SearchParams { query }`. +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct SearchRequest { + /// The search query to look up on the web. 
+ pub query: String, +} + +impl SearchRequest { + /// Convenience constructor from a bare query. + pub fn new(query: impl Into) -> Self { + Self { + query: query.into(), + } + } +} + +/// An error from the search backend's HTTP fetch. +/// +/// Reproduces the failure cases the Python `_search_duckduckgo` raised: a +/// challenge/CAPTCHA page, a non-2xx HTTP status, and a transport error. +#[derive(Debug, thiserror::Error)] +pub enum SearchError { + /// DuckDuckGo returned a challenge/anti-bot page (HTTP 202, or the body + /// mentions "anomaly"). + #[error( + "DuckDuckGo is showing a challenge/CAPTCHA – too many requests or suspicious activity." + )] + Challenge, + /// The server returned a client/server error status. + #[error("HTTP {status}: {snippet}")] + Http { + /// The HTTP status code. + status: u16, + /// The first 200 chars of the response body (matching the Python + /// `response.text[:200]`). + snippet: String, + }, + /// A transport-level error (connection, timeout, decoding). + #[error("{0}")] + Request(String), +} + +/// The network seam: fetch the raw DuckDuckGo Lite HTML for a query. +/// +/// Implemented for real by [`HttpSearchBackend`] and by a fake in tests, so the +/// tool's parsing/formatting can be exercised without a real network — mirroring +/// the `browser` / `python` / `mcp` backend seams. +#[async_trait::async_trait] +pub trait SearchBackend: Send + Sync { + /// Fetch the DuckDuckGo Lite result HTML for `query`. + async fn fetch(&self, query: &str) -> Result; +} + +/// The real [`SearchBackend`]: a `reqwest` client against DuckDuckGo Lite. +pub struct HttpSearchBackend { + client: reqwest::Client, + base_url: String, +} + +impl HttpSearchBackend { + /// Construct the backend with a default client and the DuckDuckGo Lite + /// endpoint. 
+    pub fn new() -> Self {
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(SEARCH_REQUEST_TIMEOUT_SECS))
+            .build()
+            .unwrap_or_else(|_| reqwest::Client::new());
+        Self {
+            client,
+            base_url: DDG_LITE_BASE_URL.to_string(),
+        }
+    }
+}
+
+impl Default for HttpSearchBackend {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait::async_trait]
+impl SearchBackend for HttpSearchBackend {
+    async fn fetch(&self, query: &str) -> Result<String, SearchError> {
+        // `reqwest`'s `.query()` produces application/x-www-form-urlencoded
+        // output (space -> `+`); the encoded byte set differs from Python's
+        // `quote_plus` on a few characters (e.g. `~`, `*`), but DuckDuckGo
+        // decodes both to the same query, so results are equivalent. Redirects
+        // are followed by default, matching `follow_redirects=True`.
+        let response = self
+            .client
+            .get(&self.base_url)
+            .query(&[("q", query)])
+            .header(USER_AGENT, DDG_USER_AGENT)
+            .header(ACCEPT, DDG_ACCEPT)
+            .header(ACCEPT_LANGUAGE, DDG_ACCEPT_LANGUAGE)
+            .send()
+            .await
+            .map_err(|err| SearchError::Request(err.to_string()))?;
+
+        // Capture the status before `text()` consumes the response.
+        let status = response.status().as_u16();
+        let body = response
+            .text()
+            .await
+            .map_err(|err| SearchError::Request(err.to_string()))?;
+
+        classify_response(status, &body)?;
+        Ok(body)
+    }
+}
+
+/// Classify an HTTP response: a challenge page first, then any `>= 400`
+/// status as an error, otherwise success.
+///
+/// A response is a challenge when the status is 202, or when the body mentions
+/// "anomaly" (case-insensitively) *and* carries no `result-link` markup. The
+/// second condition keeps a genuine results page from being misclassified when
+/// the user's own query or a result snippet contains the word "anomaly"
+/// (e.g. a search for "anomaly detection").
+///
+/// # Errors
+///
+/// * [`SearchError::Challenge`] for a challenge page as described above.
+/// * [`SearchError::Http`] for a `>= 400` status, carrying the first 200
+///   characters of the body.
+pub fn classify_response(status: u16, body: &str) -> Result<(), SearchError> {
+    // Check for result links first so a normal results page skips the
+    // lowercase copy of the whole body.
+    let anomaly_without_results =
+        !body.contains("result-link") && body.to_ascii_lowercase().contains("anomaly");
+    if status == 202 || anomaly_without_results {
+        return Err(SearchError::Challenge);
+    }
+    if status >= 400 {
+        let snippet: String = body.chars().take(200).collect();
+        return Err(SearchError::Http { status, snippet });
+    }
+    Ok(())
+}
+
+/// The async `search` tool.
+///
+/// Holds the injected [`SearchBackend`]. Cheap to clone (the backend is behind
+/// an `Arc`).
+#[derive(Clone)] +pub struct SearchTool { + backend: Arc, +} + +impl Default for SearchTool { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for SearchTool { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // The backend is an opaque trait object; show only the tool identity. + f.debug_struct("SearchTool").finish_non_exhaustive() + } +} + +impl SearchTool { + /// Construct the tool backed by the real [`HttpSearchBackend`]. + pub fn new() -> Self { + Self::with_backend(Arc::new(HttpSearchBackend::new())) + } + + /// Construct the tool with a custom backend (used by tests). + pub fn with_backend(backend: Arc) -> Self { + Self { backend } + } + + /// The tool name surfaced to the model. + pub fn name(&self) -> &'static str { + SEARCH_TOOL_NAME + } +} + +/// Approval key: the query identifies a call for session caching, mirroring the +/// shape the other non-FS tools use (`tool_search.rs`, `web_search.rs`). This +/// tool is read-only and benign, so the key is rarely consulted; it exists to +/// satisfy the [`Approvable`] contract uniformly. +#[derive(serde::Serialize, Clone, Debug, Eq, PartialEq, Hash)] +pub struct SearchApprovalKey { + query: String, +} + +impl Approvable for SearchTool { + type ApprovalKey = SearchApprovalKey; + + fn approval_keys(&self, req: &SearchRequest) -> Vec { + vec![SearchApprovalKey { + query: req.query.clone(), + }] + } + + /// `search` touches no filesystem; request the default sandbox permissions + /// (no escalation), mirroring the other non-FS tools. + fn sandbox_permissions(&self, _req: &SearchRequest) -> SandboxPermissions { + SandboxPermissions::UseDefault + } + + // `exec_approval_requirement` is intentionally left at its trait default + // (`None`): the search is a benign, read-only HTTP GET (the Python action had + // no approval gate either). 
Returning `None` lets the orchestrator apply + // `default_exec_approval_requirement`, which yields `Skip` under any + // non-prompting policy. The outbound request mirrors the crate's existing + // network usage (the MCP HTTP client, analytics) which is likewise ungated. +} + +impl Sandboxable for SearchTool { + fn sandbox_preference(&self) -> SandboxPreference { + // Let the provider decide (today everything resolves to + // `SandboxType::None`). Keeps the seam uniform with the other non-FS + // tools. + SandboxPreference::Auto + } + + fn escalate_on_failure(&self) -> bool { + // The tool never produces a sandbox denial, so this is moot; `true` keeps + // it uniform with the other tools. + true + } +} + +#[async_trait::async_trait] +impl ToolRuntime for SearchTool { + fn parallel_safe(&self, _req: &SearchRequest) -> bool { + // A read-only HTTP GET + pure parse mutates no shared state, so it is safe + // to run concurrently with other tools — matching the parallel-safe + // stance of `tool_search` / `web_search`. + true + } + + async fn run( + &self, + req: &SearchRequest, + attempt: &SandboxAttempt<'_>, + _ctx: &ToolCtx, + ) -> Result { + // No sandbox is exercised (the tool does no FS I/O); acknowledge the + // attempt to make the seam explicit, matching the other tools. + let _ = attempt; + + let query = req.query.trim(); + if query.is_empty() { + return Err(ToolError::Rejected( + "search query must not be empty".to_string(), + )); + } + + // A fetch failure is surfaced to the model as a soft error (nonzero exit + // with the message on stderr), mirroring the Python action's + // `ActionResult(error="Search failed: …")` and the MCP handler's + // model-facing error mapping — not a hard tool error. 
+ match self.backend.fetch(query).await { + Ok(html) => { + let results = parse_lite_results(&html); + let stdout = if results.is_empty() { + format!("No results found for \"{query}\".") + } else { + format_results(query, &results) + }; + Ok(ExecOutput { + exit_code: 0, + stdout, + stderr: String::new(), + }) + } + Err(err) => Ok(ExecOutput { + exit_code: 1, + stdout: String::new(), + stderr: format!("Search failed: {err}"), + }), + } + } +} + +// --------------------------------------------------------------------------- +// Pure helpers (parsing + formatting) — ported from the Python action. +// --------------------------------------------------------------------------- + +/// Format parsed results into the readable text block the model sees. +/// +/// Faithful to the Python action's `extracted_content` layout: a header (count + +/// the "you already have the results" guidance), then a numbered list with each +/// result's title, `URL:` line, and optional snippet, blank-line separated. +pub fn format_results(query: &str, results: &[SearchResult]) -> String { + let mut lines: Vec = Vec::with_capacity(results.len() * 4 + 1); + lines.push(format!( + "Search results for \"{query}\" ({} results):\n\ + You already have the results below – do NOT navigate to a search engine.\n\ + If these snippets are not enough, navigate directly to the result URLs for more detail.\n", + results.len() + )); + for (i, result) in results.iter().enumerate() { + lines.push(format!("{}. {}", i + 1, result.title)); + lines.push(format!(" URL: {}", result.url)); + if !result.description.is_empty() { + lines.push(format!(" {}", result.description)); + } + lines.push(String::new()); + } + lines.join("\n") +} + +/// Unwrap a DuckDuckGo redirect URL to its real destination. 
+///
+/// Ported from the Python `_extract_real_url`:
+/// * protocol-relative `//host/…` gets an `https:` scheme;
+/// * a `duckduckgo.com/l/?uddg=…` redirect is unwrapped to its `uddg` target
+///   (form-decoded, matching `parse_qs` + `unquote`);
+/// * ad links (`duckduckgo.com/y.js`) and non-`http(s)` schemes are dropped
+///   (returns `None`).
+pub fn extract_real_url(ddg_url: &str) -> Option<String> {
+    if ddg_url.is_empty() {
+        return None;
+    }
+
+    let with_scheme = match ddg_url.strip_prefix("//") {
+        Some(rest) => format!("https://{rest}"),
+        None => ddg_url.to_string(),
+    };
+
+    // Unwrap the redirect when it carries a `uddg` target; if the parameter
+    // cannot be read, keep the (scheme-fixed) link unchanged.
+    let is_redirect = with_scheme.contains("duckduckgo.com/l/") && with_scheme.contains("uddg=");
+    let url = if is_redirect {
+        query_param(&with_scheme, "uddg").unwrap_or(with_scheme)
+    } else {
+        with_scheme
+    };
+
+    // Ad links – skip.
+    if url.contains("duckduckgo.com/y.js") {
+        return None;
+    }
+
+    // Only allow http/https to prevent unsafe URLs (javascript:, data:, …).
+    if !(url.starts_with("https://") || url.starts_with("http://")) {
+        return None;
+    }
+
+    Some(url)
+}
+
+/// Collapse runs of whitespace into a single space and trim the ends.
+///
+/// Ported from the Python `_normalize_whitespace`
+/// (`re.sub(r"\s+", " ", text).strip()`).
+pub fn normalize_whitespace(text: &str) -> String {
+    whitespace_regex()
+        .replace_all(text.trim(), " ")
+        .into_owned()
+}
+
+/// Parse search results out of a DuckDuckGo Lite HTML response.
+///
+/// Ported from the Python `_parse_lite_results`: for each `a.result-link`, take
+/// its (entity-decoded) text as the title and unwrap its `href`; skip empty /
+/// "more info" / duplicate / `duckduckgo.com` results; and attach the snippet
+/// from the first following `td.result-snippet` that precedes the next result
+/// link.
+pub fn parse_lite_results(html: &str) -> Vec<SearchResult> {
+    let anchors = collect_anchors(html);
+    let snippets = collect_snippets(html);
+
+    let mut results: Vec<SearchResult> = Vec::new();
+    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
+
+    for (idx, anchor) in anchors.iter().enumerate() {
+        if anchor.title.is_empty() || anchor.title.eq_ignore_ascii_case("more info") {
+            continue;
+        }
+
+        let Some(url) = extract_real_url(&anchor.href) else {
+            continue;
+        };
+        // `insert` returns `false` for an already-seen URL, so one call both
+        // records the URL and detects duplicates. DuckDuckGo-internal targets
+        // are rejected first and therefore never recorded.
+        if url.contains("duckduckgo.com") || !seen.insert(url.clone()) {
+            continue;
+        }
+
+        // The snippet is the first `result-snippet` after this anchor and before
+        // the next one (matching the Python sibling-walk that stops at the next
+        // result link).
+        let next_pos = anchors.get(idx + 1).map_or(usize::MAX, |a| a.pos);
+        let description = snippets
+            .iter()
+            .find(|s| s.pos > anchor.pos && s.pos < next_pos)
+            .map(|s| s.text.clone())
+            .unwrap_or_default();
+
+        results.push(SearchResult {
+            title: anchor.title.clone(),
+            url,
+            description,
+        });
+    }
+
+    results
+}
+
+/// A raw `a.result-link` extracted from the HTML, with its byte offset.
+struct RawAnchor {
+    pos: usize,
+    href: String,
+    title: String,
+}
+
+/// A raw `td.result-snippet` extracted from the HTML, with its byte offset.
+struct RawSnippet {
+    pos: usize,
+    text: String,
+}
+
+/// Extract every `a.result-link` anchor (offset, href, title) in document order.
+fn collect_anchors(html: &str) -> Vec<RawAnchor> {
+    anchor_regex()
+        .captures_iter(html)
+        .filter_map(|caps| {
+            let whole = caps.get(0)?;
+            let attrs = caps.get(1).map_or("", |m| m.as_str());
+            let inner = caps.get(2).map_or("", |m| m.as_str());
+            if !has_class(attrs, "result-link") {
+                return None;
+            }
+            Some(RawAnchor {
+                pos: whole.start(),
+                href: attr_value(attrs, AttrName::Href).unwrap_or_default(),
+                // Strip tags, decode entities, then trim. DuckDuckGo Lite titles
+                // are plain text, so this matches the Python `get_text(strip=True)`
+                // title extraction; on any inline markup it yields the cleaner
+                // space-preserving text rather than BeautifulSoup's node-join.
+                title: text_from_html(inner, "").trim().to_string(),
+            })
+        })
+        .collect()
+}
+
+/// Extract every `td.result-snippet` (offset, normalized text) in document order.
+fn collect_snippets(html: &str) -> Vec<RawSnippet> {
+    td_regex()
+        .captures_iter(html)
+        .filter_map(|caps| {
+            let whole = caps.get(0)?;
+            let attrs = caps.get(1).map_or("", |m| m.as_str());
+            let inner = caps.get(2).map_or("", |m| m.as_str());
+            if !has_class(attrs, "result-snippet") {
+                return None;
+            }
+            Some(RawSnippet {
+                pos: whole.start(),
+                // `get_text(separator=" ")` then normalize whitespace.
+                text: normalize_whitespace(&text_from_html(inner, " ")),
+            })
+        })
+        .collect()
+}
+
+/// Strip HTML tags (replacing each with `separator`) and decode entities.
+fn text_from_html(html: &str, separator: &str) -> String {
+    let without_tags = tag_regex().replace_all(html, separator);
+    decode_entities(&without_tags)
+}
+
+/// Whether a tag's attribute string declares `class` containing `class_name`.
+fn has_class(attrs: &str, class_name: &str) -> bool {
+    attr_value(attrs, AttrName::Class)
+        .is_some_and(|value| value.split_whitespace().any(|c| c == class_name))
+}
+
+/// The attributes we extract from a tag.
+#[derive(Clone, Copy)]
+enum AttrName {
+    Href,
+    Class,
+}
+
+/// Extract a quoted attribute value from a tag's attribute string.
+fn attr_value(attrs: &str, name: AttrName) -> Option<String> {
+    let re = match name {
+        AttrName::Href => href_regex(),
+        AttrName::Class => class_regex(),
+    };
+    re.captures(attrs)
+        .and_then(|caps| caps.get(1))
+        .map(|m| m.as_str().to_string())
+}
+
+/// Read a single query parameter's value, form-decoded (matching `parse_qs`:
+/// `+` becomes a space and `%XX` is percent-decoded).
+fn query_param(url: &str, key: &str) -> Option<String> {
+    let (_, query) = url.split_once('?')?;
+    // Drop any fragment before splitting pairs.
+    let query = query.split('#').next().unwrap_or(query);
+    for pair in query.split('&') {
+        // A bare key with no `=` is treated as having an empty value.
+        let (k, v) = pair.split_once('=').unwrap_or((pair, ""));
+        if k == key {
+            return Some(percent_decode_form(v));
+        }
+    }
+    None
+}
+
+/// Form-decode a query component: `+` -> space, `%XX` -> byte, then UTF-8.
+///
+/// A `%` that is not followed by two hex digits is kept literally, and invalid
+/// UTF-8 in the decoded bytes is replaced lossily rather than rejected.
+fn percent_decode_form(value: &str) -> String {
+    let bytes = value.as_bytes();
+    let mut out: Vec<u8> = Vec::with_capacity(bytes.len());
+    let mut i = 0;
+    while i < bytes.len() {
+        match bytes[i] {
+            b'+' => out.push(b' '),
+            b'%' => {
+                let hi = bytes.get(i + 1).copied().and_then(hex_val);
+                let lo = bytes.get(i + 2).copied().and_then(hex_val);
+                if let (Some(hi), Some(lo)) = (hi, lo) {
+                    // Both nibbles are <= 15, so this cannot overflow a `u8`.
+                    out.push(hi * 16 + lo);
+                    i += 3;
+                    continue;
+                }
+                out.push(b'%');
+            }
+            other => out.push(other),
+        }
+        i += 1;
+    }
+    String::from_utf8_lossy(&out).into_owned()
+}
+
+/// Hex digit value of an ASCII byte, or `None`.
+fn hex_val(byte: u8) -> Option<u8> {
+    match byte {
+        b'0'..=b'9' => Some(byte - b'0'),
+        b'a'..=b'f' => Some(byte - b'a' + 10),
+        b'A'..=b'F' => Some(byte - b'A' + 10),
+        _ => None,
+    }
+}
+
+/// Decode the common HTML character references in one pass.
+///
+/// Covers the named references that appear in DuckDuckGo snippets plus all
+/// numeric references (`&#NN;` / `&#xHH;`); unknown named references are left
+/// intact (BeautifulSoup decodes the full set — this is the practical subset).
+fn decode_entities(text: &str) -> String {
+    entity_regex()
+        .replace_all(text, |caps: &regex::Captures<'_>| {
+            // Capture 1 is the reference body without the surrounding `&` / `;`.
+            let body = &caps[1];
+            // Hex must be tried before decimal: both forms start with `#`.
+            if let Some(hex) = body.strip_prefix("#x").or_else(|| body.strip_prefix("#X")) {
+                return decode_codepoint(u32::from_str_radix(hex, 16).ok())
+                    .unwrap_or_else(|| caps[0].to_string());
+            }
+            if let Some(dec) = body.strip_prefix('#') {
+                return decode_codepoint(dec.parse::<u32>().ok())
+                    .unwrap_or_else(|| caps[0].to_string());
+            }
+            match body {
+                "amp" => "&",
+                "lt" => "<",
+                "gt" => ">",
+                "quot" => "\"",
+                "apos" => "'",
+                "nbsp" => " ",
+                // Typographic punctuation.
+                "hellip" => "…",
+                "mdash" => "—",
+                "ndash" => "–",
+                "rsquo" => "\u{2019}",
+                "lsquo" => "\u{2018}",
+                "rdquo" => "\u{201D}",
+                "ldquo" => "\u{201C}",
+                "laquo" => "«",
+                "raquo" => "»",
+                "middot" => "·",
+                "bull" => "•",
+                // Common symbols.
+                "copy" => "©",
+                "reg" => "®",
+                "trade" => "™",
+                "times" => "×",
+                "divide" => "÷",
+                "deg" => "°",
+                "euro" => "€",
+                "pound" => "£",
+                "cent" => "¢",
+                "sect" => "§",
+                // Common Western-European accented letters.
+                "aacute" => "á",
+                "agrave" => "à",
+                "acirc" => "â",
+                "auml" => "ä",
+                "aring" => "å",
+                "ccedil" => "ç",
+                "eacute" => "é",
+                "egrave" => "è",
+                "ecirc" => "ê",
+                "euml" => "ë",
+                "iacute" => "í",
+                "iuml" => "ï",
+                "ntilde" => "ñ",
+                "oacute" => "ó",
+                "ocirc" => "ô",
+                "ouml" => "ö",
+                "uacute" => "ú",
+                "uuml" => "ü",
+                "szlig" => "ß",
+                // Unknown named reference: leave the original text intact
+                // (BeautifulSoup decodes the full HTML5 set; this is the
+                // practical subset DuckDuckGo emits, plus all numeric refs).
+                _ => return caps[0].to_string(),
+            }
+            .to_string()
+        })
+        .into_owned()
+}
+
+/// Map a numeric character-reference code point to its string, if valid.
+fn decode_codepoint(code: Option<u32>) -> Option<String> {
+    code.and_then(char::from_u32).map(|c| c.to_string())
+}
+
+// --- Cached regexes (compiled once; patterns are constant) -----------------
+//
+// The tag regexes use `[^>]*` for the attribute span, which assumes attribute
+// values contain no literal `>` — true for the fixed DuckDuckGo Lite markup
+// (see the module doc). On non-conforming markup a `>` inside an attribute
+// value would truncate the match (dropping that result), never panic.
+
+fn anchor_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| Regex::new(r"(?is)<a\b([^>]*)>(.*?)</a>").expect("valid anchor regex"))
+}
+
+fn td_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| Regex::new(r"(?is)<td\b([^>]*)>(.*?)</td>").expect("valid td regex"))
+}
+
+fn tag_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| Regex::new(r"(?s)<[^>]*>").expect("valid tag regex"))
+}
+
+fn href_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| {
+        Regex::new(r#"(?i)(?:^|\s)href\s*=\s*["']([^"']*)["']"#).expect("valid href regex")
+    })
+}
+
+fn class_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| {
+        Regex::new(r#"(?i)(?:^|\s)class\s*=\s*["']([^"']*)["']"#).expect("valid class regex")
+    })
+}
+
+fn whitespace_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| Regex::new(r"\s+").expect("valid whitespace regex"))
+}
+
+fn entity_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| {
+        Regex::new(r"&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);")
+            .expect("valid entity regex")
+    })
+}
diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs
new file mode 100644
index 00000000..c911f87e
--- /dev/null
+++
b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -0,0 +1,476 @@ +//! Tests for the async `search` tool ([`SearchTool`]). +//! +//! No real network is touched: the pure parsing/formatting/URL helpers are +//! exercised against fixture HTML, and the `run` path is driven through a fake +//! [`SearchBackend`] (mirroring `update_plan_tests` / `tool_search_tests`). + +use std::sync::Arc; + +use super::search::{ + classify_response, extract_real_url, format_results, normalize_whitespace, parse_lite_results, + SearchBackend, SearchError, SearchRequest, SearchResult, SearchTool, SEARCH_TOOL_NAME, +}; +use crate::tools::approval::AskForApproval; +use crate::tools::orchestrator::{ToolOrchestrator, TurnEnv}; +use crate::tools::runtime::{ + Approvable, AutoApprover, SandboxAttempt, ToolCtx, ToolError, ToolRuntime, +}; +use crate::tools::sandbox::{ + FileSystemSandboxPolicy, NoneSandboxProvider, SandboxLaunch, SandboxPermissions, SandboxType, +}; + +// ---- test scaffolding (mirrors update_plan_tests) ------------------------- + +fn none_launch() -> SandboxLaunch { + SandboxLaunch { + sandbox: SandboxType::None, + cancel: None, + } +} + +fn none_attempt(launch: &SandboxLaunch) -> SandboxAttempt<'_> { + SandboxAttempt { + sandbox: SandboxType::None, + permissions: SandboxPermissions::UseDefault, + enforce_managed_network: false, + launch, + cancel: None, + } +} + +fn ctx() -> ToolCtx { + ToolCtx { + call_id: "test-call".to_string(), + tool_name: "search".to_string(), + cwd: std::env::temp_dir(), + artifact_root: std::env::temp_dir().join("artifacts"), + } +} + +fn turn_env() -> TurnEnv { + TurnEnv { + file_system_sandbox_policy: FileSystemSandboxPolicy { + restricted: false, + denied_read: false, + }, + managed_network_active: false, + strict_auto_review: false, + use_guardian: false, + } +} + +/// A fake backend returning a canned HTML body (no network). 
+struct HtmlBackend(String); + +#[async_trait::async_trait] +impl SearchBackend for HtmlBackend { + async fn fetch(&self, _query: &str) -> Result { + Ok(self.0.clone()) + } +} + +/// A fake backend returning a challenge error (no network). +struct ChallengeBackend; + +#[async_trait::async_trait] +impl SearchBackend for ChallengeBackend { + async fn fetch(&self, _query: &str) -> Result { + Err(SearchError::Challenge) + } +} + +/// A small, realistic DuckDuckGo Lite results fixture exercising: a redirect +/// URL, an entity in the snippet, a "More info" link (skipped), a duplicate +/// (deduped), a `duckduckgo.com` target (skipped), a direct (non-redirect) link, +/// and a result without a snippet. +const FIXTURE: &str = r#" + + + + + + + + + + + + + + + + + + + + + +
1. The Rust Programming Language
 A language empowering everyone to build reliable & efficient software — fast.
www.rust-lang.org
2. Rust (duplicate target)
duplicate should be dropped
DuckDuckGo About
a duckduckgo.com target, should be dropped
More info
Direct Link No Redirect
direct link snippet
No Snippet Result
+ +"#; + +// ---- pure helpers: normalize_whitespace ----------------------------------- + +#[test] +fn normalize_whitespace_collapses_and_trims() { + assert_eq!(normalize_whitespace(" a \n\t b c \r\n"), "a b c"); + assert_eq!(normalize_whitespace("single"), "single"); + assert_eq!(normalize_whitespace(" "), ""); +} + +// ---- pure helpers: extract_real_url --------------------------------------- + +#[test] +fn extract_real_url_unwraps_ddg_redirect() { + let raw = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage%3Fx%3D1&rut=abc"; + assert_eq!( + extract_real_url(raw), + Some("https://example.com/page?x=1".to_string()) + ); +} + +#[test] +fn extract_real_url_decodes_plus_as_space() { + // `parse_qs` semantics: `+` in a query value decodes to a space. + let raw = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fa+b"; + assert_eq!( + extract_real_url(raw), + Some("https://example.com/a b".to_string()) + ); +} + +#[test] +fn extract_real_url_adds_scheme_to_protocol_relative() { + assert_eq!( + extract_real_url("//example.com/x"), + Some("https://example.com/x".to_string()) + ); +} + +#[test] +fn extract_real_url_passes_through_plain_http() { + assert_eq!( + extract_real_url("https://example.com/"), + Some("https://example.com/".to_string()) + ); + assert_eq!( + extract_real_url("http://example.com/"), + Some("http://example.com/".to_string()) + ); +} + +#[test] +fn extract_real_url_drops_ads_and_unsafe_and_empty() { + // Ad links. + assert_eq!( + extract_real_url("//duckduckgo.com/y.js?ad_provider=x"), + None + ); + // Non-http(s) schemes. + assert_eq!(extract_real_url("javascript:alert(1)"), None); + assert_eq!(extract_real_url("data:text/html,hi"), None); + // Empty. 
+ assert_eq!(extract_real_url(""), None); +} + +// ---- pure helpers: parse_lite_results ------------------------------------- + +#[test] +fn parse_lite_results_extracts_decodes_dedupes_and_filters() { + let results = parse_lite_results(FIXTURE); + + // Kept, in order: rust-lang (redirect), direct link, no-snippet result. + // Dropped: duplicate target, duckduckgo.com target, "More info" title. + let titles: Vec<&str> = results.iter().map(|r| r.title.as_str()).collect(); + assert_eq!( + titles, + vec![ + "The Rust Programming Language", + "Direct Link No Redirect", + "No Snippet Result", + ] + ); + + // First result: redirect unwrapped + snippet entity-decoded + normalized. + assert_eq!(results[0].url, "https://www.rust-lang.org/"); + assert_eq!( + results[0].description, + "A language empowering everyone to build reliable & efficient software — fast." + ); + + // Direct (non-redirect) link is passed through with its own snippet. + assert_eq!(results[1].url, "https://direct.example.com/page"); + assert_eq!(results[1].description, "direct link snippet"); + + // A result with no following snippet gets an empty description. + assert_eq!(results[2].url, "https://no-snippet.example.com/"); + assert_eq!(results[2].description, ""); +} + +#[test] +fn parse_lite_results_handles_empty_and_resultless_html() { + assert!(parse_lite_results("").is_empty()); + assert!(parse_lite_results("no results here").is_empty()); +} + +/// Inline markup inside a title/snippet, real whitespace runs, and a broadened +/// named entity: exercises `text_from_html` tag-stripping (both separators), +/// `normalize_whitespace` via the parse path, and the entity table. +#[test] +fn parse_lite_results_strips_inline_markup_and_collapses_whitespace() { + let html = "\ + \ + \ +
The Rust Book
Tokio is an\n async runtime for café & more.
"; + let results = parse_lite_results(html); + assert_eq!(results.len(), 1); + // Title: tags stripped (separator ""), single-spaced. + assert_eq!(results[0].title, "The Rust Book"); + assert_eq!(results[0].url, "https://book.example.com/"); + // Snippet: tags -> space, é/& decoded, whitespace runs collapsed. + assert_eq!( + results[0].description, + "Tokio is an async runtime for café & more." + ); +} + +// ---- pure helpers: format_results ----------------------------------------- + +#[test] +fn format_results_renders_header_and_numbered_entries() { + let results = vec![ + SearchResult { + title: "First".to_string(), + url: "https://a.example/".to_string(), + description: "first snippet".to_string(), + }, + SearchResult { + title: "Second".to_string(), + url: "https://b.example/".to_string(), + description: String::new(), + }, + ]; + let out = format_results("my query", &results); + + assert!( + out.contains("Search results for \"my query\" (2 results):"), + "got: {out}" + ); + assert!( + out.contains("do NOT navigate to a search engine"), + "got: {out}" + ); + assert!(out.contains("1. First"), "got: {out}"); + assert!(out.contains(" URL: https://a.example/"), "got: {out}"); + assert!(out.contains(" first snippet"), "got: {out}"); + assert!(out.contains("2. 
Second"), "got: {out}"); + assert!(out.contains(" URL: https://b.example/"), "got: {out}"); +} + +// ---- pure helpers: classify_response -------------------------------------- + +#[test] +fn classify_response_flags_challenge_status_and_anomaly_body() { + assert!(matches!( + classify_response(202, "anything"), + Err(SearchError::Challenge) + )); + assert!(matches!( + classify_response(200, "...Anomaly detected..."), + Err(SearchError::Challenge) + )); +} + +#[test] +fn classify_response_flags_http_errors_with_snippet() { + let body = "x".repeat(500); + match classify_response(503, &body) { + Err(SearchError::Http { status, snippet }) => { + assert_eq!(status, 503); + assert_eq!( + snippet.chars().count(), + 200, + "snippet truncated to 200 chars" + ); + } + other => panic!("expected Http error, got {other:?}"), + } +} + +#[test] +fn classify_response_flags_4xx_and_pins_the_400_boundary() { + // 4xx is the case the port must handle (not just 5xx). + match classify_response(404, "not found") { + Err(SearchError::Http { status, snippet }) => { + assert_eq!(status, 404); + assert_eq!(snippet, "not found"); + } + other => panic!("expected Http error, got {other:?}"), + } + // The 399-ok / 400-error boundary pins against an off-by-one in `>= 400`. + assert!(classify_response(399, "ok").is_ok()); + assert!(matches!( + classify_response(400, "bad"), + Err(SearchError::Http { status: 400, .. 
}) + )); +} + +#[test] +fn classify_response_accepts_ok() { + assert!(classify_response(200, "fine").is_ok()); +} + +// ---- run() through the fake backend --------------------------------------- + +#[tokio::test] +async fn run_formats_results_from_backend_html() { + let tool = SearchTool::with_backend(Arc::new(HtmlBackend(FIXTURE.to_string()))); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let out = tool + .run(&SearchRequest::new("rust"), &attempt, &ctx()) + .await + .unwrap(); + + assert_eq!(out.exit_code, 0); + assert!(out.stderr.is_empty()); + assert!( + out.stdout + .contains("Search results for \"rust\" (3 results):"), + "got: {}", + out.stdout + ); + assert!( + out.stdout.contains("The Rust Programming Language"), + "got: {}", + out.stdout + ); + assert!( + out.stdout.contains("https://www.rust-lang.org/"), + "got: {}", + out.stdout + ); +} + +#[tokio::test] +async fn run_reports_no_results() { + let tool = SearchTool::with_backend(Arc::new(HtmlBackend( + "nothing".to_string(), + ))); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let out = tool + .run(&SearchRequest::new("obscure"), &attempt, &ctx()) + .await + .unwrap(); + + assert_eq!(out.exit_code, 0); + assert_eq!(out.stdout, "No results found for \"obscure\"."); +} + +#[tokio::test] +async fn run_rejects_empty_query() { + let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let err = tool + .run(&SearchRequest::new(" "), &attempt, &ctx()) + .await + .unwrap_err(); + let ToolError::Rejected(msg) = err else { + panic!("expected Rejected, got {err:?}"); + }; + assert!(msg.contains("must not be empty"), "got: {msg}"); +} + +#[tokio::test] +async fn run_surfaces_backend_failure_as_soft_error() { + let tool = SearchTool::with_backend(Arc::new(ChallengeBackend)); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let out = tool + 
.run(&SearchRequest::new("rust"), &attempt, &ctx()) + .await + .unwrap(); + + // A fetch failure is a soft, model-visible error (nonzero exit + stderr), + // not a hard tool error. + assert_eq!(out.exit_code, 1); + assert!(out.stdout.is_empty()); + assert!( + out.stderr.contains("Search failed:") && out.stderr.contains("challenge"), + "got: {}", + out.stderr + ); +} + +// ---- accessors + parallel-safety ------------------------------------------ + +#[test] +fn approval_accessors() { + let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + let req = SearchRequest::new("rust"); + assert_eq!(tool.approval_keys(&req).len(), 1, "one key per call"); + assert_eq!( + tool.sandbox_permissions(&req), + SandboxPermissions::UseDefault + ); + assert!(tool.exec_approval_requirement(&req).is_none()); +} + +#[test] +fn search_is_parallel_safe() { + let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + assert!(tool.parallel_safe(&SearchRequest::new("rust"))); +} + +#[test] +fn tool_name_is_search() { + assert_eq!(SEARCH_TOOL_NAME, "search"); + let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + assert_eq!(tool.name(), "search"); +} + +#[test] +fn request_round_trips_wire_shape() { + let json = r#"{"query":"hello world"}"#; + let req: SearchRequest = serde_json::from_str(json).unwrap(); + assert_eq!(req.query, "hello world"); + let out = serde_json::to_string(&req).unwrap(); + assert_eq!(out, json); +} + +// ---- drive a call through the orchestrator over the seam ------------------- + +#[tokio::test] +async fn orchestrated_search_completes_under_none() { + let orch = ToolOrchestrator::new(NoneSandboxProvider, AutoApprover); + let tool = SearchTool::with_backend(Arc::new(HtmlBackend(FIXTURE.to_string()))); + + let result = orch + .run( + &tool, + &SearchRequest::new("rust"), + &ctx(), + &turn_env(), + AskForApproval::Never, + ) + .await + .expect("orchestration ok"); + + assert_eq!(result.sandbox_used, 
SandboxType::None); + assert_eq!(result.output.exit_code, 0); + assert!( + result + .output + .stdout + .contains("The Rust Programming Language"), + "got: {}", + result.output.stdout + ); +} diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 79f9c7e0..6ffc4eee 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1155,6 +1155,34 @@ to the single frame that proves the task succeeded." } } + /// `search`: a LOCALLY-executed DuckDuckGo (Lite) web search. Unlike the + /// hosted [`web_search`](definitions::web_search), the client performs the + /// HTTP request itself and returns the parsed results as text. Ported from + /// the Python `search` action's description. + pub fn search() -> ToolDefinition { + ToolDefinition { + name: "search".to_string(), + description: "Search the web using DuckDuckGo and return results directly as text – \ + no browser navigation occurs. The returned results are final and complete. \ + NEVER open a search engine website after calling this action." + .to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query to look up on the web." + } + }, + "required": ["query"], + "additionalProperties": false + }), + output_schema: None, + namespace: None, + namespace_description: None, + } + } + fn agent_status_output_schema() -> Value { json!({ "oneOf": [ @@ -1924,9 +1952,10 @@ Agent-role guidance below only helps choose which agent to use after spawning is /// `WireArgs` types. The browser/python/mcp handlers need an injected backend /// (they would otherwise reach the OS), so those are supplied by the caller. /// -/// `parallel_safe` per tool: `exec_command` / `tool_search` / `web_search` = -/// `true`; `shell` / `apply_patch` / `view_image` / `browser` / `python` / -/// `update_plan` / `done` = `false` (serial). 
`mcp` is registered `false` here +/// `parallel_safe` per tool: `exec_command` / `tool_search` / `web_search` / +/// `search` = `true`; `shell` / `apply_patch` / `view_image` / `browser` / +/// `python` / `update_plan` / `done` = `false` (serial). `mcp` is registered +/// `false` here /// (a serial default); its per-request read-only hint still drives the handler's /// own [`ToolRuntime::parallel_safe`](crate::tools::ToolRuntime::parallel_safe). #[allow(clippy::too_many_arguments)] @@ -1940,6 +1969,7 @@ pub fn default_registry( update_plan: crate::tools::handlers::update_plan::UpdatePlanTool, tool_search: crate::tools::handlers::tool_search::ToolSearchTool, web_search: crate::tools::handlers::web_search::WebSearchTool, + search: crate::tools::handlers::search::SearchTool, done: crate::tools::handlers::done::DoneTool, ) -> ToolRegistry where @@ -1951,6 +1981,7 @@ where use crate::tools::handlers::done::DoneRequest; use crate::tools::handlers::mcp::McpToolCallRequest; use crate::tools::handlers::python::PythonRequest; + use crate::tools::handlers::search::SearchRequest; use crate::tools::handlers::shell::{ ExecCommandRequest, ExecCommandTool, ShellRequest, WriteStdinRequest, WriteStdinTool, }; @@ -2002,6 +2033,9 @@ where tool_search, ); reg.register::<_, WebSearchRequest>("web_search", definitions::web_search(), true, web_search); + // `search`: locally-executed DuckDuckGo search. Read-only HTTP GET + + // pure parse, so parallel-safe like `web_search` / `tool_search`. + reg.register::<_, SearchRequest>("search", definitions::search(), true, search); // `done`: the completion tool. Serial (terminal; must not be reordered). 
reg.register::<_, DoneRequest>("done", definitions::done(), false, done); diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 8efc4089..9315e87a 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -28,6 +28,7 @@ use crate::tools::handlers::mcp::{ McpCallResult, McpClient, McpTool, McpToolCallRequest, McpWireArgs, }; use crate::tools::handlers::python::{PythonBackend, PythonRequest, PythonTool}; +use crate::tools::handlers::search::{SearchBackend, SearchError, SearchTool}; use crate::tools::handlers::shell::{ShellRequest, ShellTool}; use crate::tools::handlers::tool_search::{ToolSearchEntry, ToolSearchRequest, ToolSearchTool}; use crate::tools::handlers::update_plan::{UpdatePlanRequest, UpdatePlanTool}; @@ -485,6 +486,23 @@ impl McpClient for FakeMcpClient { } } +/// A fake search backend: returns a canned DuckDuckGo Lite HTML fragment with a +/// single result echoing the query, so no network is touched (mirrors +/// `search_tests.rs`). +struct FakeSearchBackend; + +#[async_trait::async_trait] +impl SearchBackend for FakeSearchBackend { + async fn fetch(&self, query: &str) -> Result { + Ok(format!( + "\ + \ + \ +
Result for {query}
snippet for {query}
" + )) + } +} + /// Build a registry holding all handlers via [`default_registry`], using /// fake backends for browser/python/mcp so no OS resource is touched. fn full_registry() -> ToolRegistry { @@ -502,6 +520,7 @@ fn full_registry() -> ToolRegistry { ["namespace"], )]), WebSearchTool::new(WebSearchConfig::enabled()), + SearchTool::with_backend(Arc::new(FakeSearchBackend)), DoneTool::new(), ) } @@ -519,11 +538,11 @@ fn ctx_at(name: &str, cwd: PathBuf) -> ToolCtx { #[test] fn default_registry_registers_all_tools() { let reg = full_registry(); - assert_eq!(reg.len(), 12, "all tools must register"); + assert_eq!(reg.len(), 13, "all tools must register"); let defs = reg.model_visible_definitions(); assert_eq!( defs.len(), - 12, + 13, "model_visible_definitions must list all tools" ); let mut names: Vec<&str> = defs.iter().map(|d| d.name.as_str()).collect(); @@ -537,6 +556,7 @@ fn default_registry_registers_all_tools() { "exec_command", "mcp", "python", + "search", "shell", "tool_search", "update_plan", @@ -562,6 +582,7 @@ fn parallel_safe_flags_match_registration() { // Pure / read-only tools are parallel-safe. assert_eq!(reg.parallel_safe("tool_search"), Some(true)); assert_eq!(reg.parallel_safe("web_search"), Some(true)); + assert_eq!(reg.parallel_safe("search"), Some(true)); // Everything else is serial. for name in [ "shell", @@ -801,6 +822,41 @@ async fn tool_search_and_web_search_dispatch() { ); } +#[tokio::test] +async fn search_dispatches_to_the_fake_backend() { + let reg = full_registry(); + let orch = ToolOrchestrator::stub(); + let out = reg + .dispatch( + "search", + &serde_json::json!({ "query": "rust lang" }), + &ctx("search"), + &env(), + AskForApproval::Never, + &orch, + ) + .await + .expect("search should dispatch"); + assert_eq!(out.exit_code, 0); + // The fake backend's canned HTML yields one result whose title echoes the + // query, its unwrapped destination URL, and the snippet. 
+ assert!( + out.stdout.contains("Result for rust lang"), + "search stdout: {:?}", + out.stdout + ); + assert!( + out.stdout.contains("https://example.com/"), + "search stdout: {:?}", + out.stdout + ); + assert!( + out.stdout.contains("snippet for rust lang"), + "search stdout: {:?}", + out.stdout + ); +} + #[tokio::test] async fn browser_bad_action_value_surfaces_an_error_naming_the_tool() { let reg = full_registry(); From fefc3aa5b4b77e00e13ddd2b084e31c3474bc8d0 Mon Sep 17 00:00:00 2001 From: reformedot Date: Thu, 4 Jun 2026 18:15:49 -0700 Subject: [PATCH 02/10] Add ignored live DuckDuckGo smoke test for the search tool A network-dependent end-to-end check against the real DuckDuckGo Lite endpoint via the default HttpSearchBackend. Ignored by default (so CI and `cargo test` stay deterministic and offline); run manually with: cargo test -p browser-use-agent --lib -- --ignored --nocapture search_live_smoke Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/tools/handlers/search_tests.rs | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs index c911f87e..0a985938 100644 --- a/crates/browser-use-agent/src/tools/handlers/search_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -474,3 +474,48 @@ async fn orchestrated_search_completes_under_none() { result.output.stdout ); } + +// ---- live smoke (ignored: hits the real DuckDuckGo endpoint) -------------- + +/// End-to-end check against the REAL DuckDuckGo Lite endpoint via the default +/// [`HttpSearchBackend`]. Ignored by default (network + non-deterministic, and +/// DuckDuckGo may rate-limit/serve a challenge). 
Run it manually with: +/// +/// ```text +/// cargo test -p browser-use-agent --lib -- --ignored --nocapture search_live_smoke +/// ``` +#[tokio::test] +#[ignore = "hits the live DuckDuckGo Lite endpoint"] +async fn search_live_smoke() { + let tool = SearchTool::new(); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let out = tool + .run( + &SearchRequest::new("rust programming language"), + &attempt, + &ctx(), + ) + .await + .expect("run ok"); + + eprintln!( + "exit_code={}\n--- stdout ---\n{}\n--- stderr ---\n{}", + out.exit_code, out.stdout, out.stderr + ); + // A challenge/CAPTCHA is a legitimate live outcome (exit 1 + message); only + // assert hard on the success shape so the test documents both paths. + if out.exit_code == 0 { + assert!( + out.stdout.contains("Search results for") || out.stdout.contains("No results found"), + "unexpected stdout: {}", + out.stdout + ); + } else { + assert!( + out.stderr.contains("Search failed:"), + "unexpected stderr: {}", + out.stderr + ); + } +} From 99b43482cae84b8d57b91bbab9a88eb1dc668201 Mon Sep 17 00:00:00 2001 From: reformedot Date: Thu, 4 Jun 2026 18:26:47 -0700 Subject: [PATCH 03/10] Truncate search result title (15) and description (100) for token efficiency The formatted model-facing output now trims each result's title to 15 chars and description to 100 chars (ellipsis counted within the cap, on a Unicode char boundary); destination URLs are kept intact so they stay usable. Truncation is applied at the display layer (`format_results`), so `SearchResult` still carries full data for any other consumer. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/tools/handlers/search.rs | 37 +++++++++++++-- .../src/tools/handlers/search_tests.rs | 47 +++++++++++++++++-- .../src/tools/registry_tests.rs | 5 +- 3 files changed, 79 insertions(+), 10 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/search.rs b/crates/browser-use-agent/src/tools/handlers/search.rs index d21aeeaa..8fde09b5 100644 --- a/crates/browser-use-agent/src/tools/handlers/search.rs +++ b/crates/browser-use-agent/src/tools/handlers/search.rs @@ -70,6 +70,14 @@ const DDG_ACCEPT_LANGUAGE: &str = "en-US,en;q=0.9"; /// Request timeout (the Python action used `timeout=30.0`). const SEARCH_REQUEST_TIMEOUT_SECS: u64 = 30; +/// Max characters of a result title in the formatted output. Titles are trimmed +/// (with an ellipsis counted within the cap) to keep the model-facing text token +/// efficient. +const MAX_TITLE_CHARS: usize = 15; + +/// Max characters of a result description (snippet) in the formatted output. +const MAX_DESCRIPTION_CHARS: usize = 100; + /// A single parsed search result. /// /// Mirrors the Python action's `{title, url, description}` dict. @@ -356,7 +364,10 @@ impl ToolRuntime for SearchTool { /// /// Faithful to the Python action's `extracted_content` layout: a header (count + /// the "you already have the results" guidance), then a numbered list with each -/// result's title, `URL:` line, and optional snippet, blank-line separated. +/// result's title, `URL:` line, and optional snippet, blank-line separated. The +/// title and description are truncated ([`MAX_TITLE_CHARS`] / +/// [`MAX_DESCRIPTION_CHARS`]) for token efficiency; URLs are kept intact so they +/// remain usable. 
pub fn format_results(query: &str, results: &[SearchResult]) -> String { let mut lines: Vec = Vec::with_capacity(results.len() * 4 + 1); lines.push(format!( @@ -366,16 +377,36 @@ pub fn format_results(query: &str, results: &[SearchResult]) -> String { results.len() )); for (i, result) in results.iter().enumerate() { - lines.push(format!("{}. {}", i + 1, result.title)); + lines.push(format!( + "{}. {}", + i + 1, + truncate_chars(&result.title, MAX_TITLE_CHARS) + )); lines.push(format!(" URL: {}", result.url)); if !result.description.is_empty() { - lines.push(format!(" {}", result.description)); + lines.push(format!( + " {}", + truncate_chars(&result.description, MAX_DESCRIPTION_CHARS) + )); } lines.push(String::new()); } lines.join("\n") } +/// Truncate `text` to at most `max` characters (Unicode scalar values). When it +/// must cut, the last kept character is an ellipsis `…`, so the result is never +/// longer than `max` and the truncation is visible. Trailing whitespace before +/// the ellipsis is trimmed so the text reads cleanly. +fn truncate_chars(text: &str, max: usize) -> String { + if text.chars().count() <= max { + return text.to_string(); + } + // Reserve one character for the ellipsis. + let prefix: String = text.chars().take(max.saturating_sub(1)).collect(); + format!("{}…", prefix.trim_end()) +} + /// Unwrap a DuckDuckGo redirect URL to its real destination. 
/// /// Ported from the Python `_extract_real_url`: diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs index 0a985938..342b1324 100644 --- a/crates/browser-use-agent/src/tools/handlers/search_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -272,6 +272,38 @@ fn format_results_renders_header_and_numbered_entries() { assert!(out.contains(" URL: https://b.example/"), "got: {out}"); } +#[test] +fn format_results_truncates_long_title_and_description() { + let results = vec![SearchResult { + title: "ThisTitleIsWayTooLongToKeep".to_string(), + url: "https://example.com/keep/this/whole/url".to_string(), + description: "d".repeat(250), + }]; + let out = format_results("q", &results); + + // Title capped at 15 characters including the ellipsis. + let title = out + .lines() + .find_map(|l| l.strip_prefix("1. ")) + .expect("title line"); + assert_eq!(title.chars().count(), 15, "title capped at 15: {title:?}"); + assert!(title.ends_with('…'), "title ellipsized: {title:?}"); + assert!(title.starts_with("ThisTitle"), "title prefix: {title:?}"); + assert!(!out.contains("TooLong"), "tail must be dropped: {out}"); + + // URL is kept intact (not truncated). + assert!( + out.contains("https://example.com/keep/this/whole/url"), + "url kept: {out}" + ); + + // Description capped at 100 characters including the ellipsis. + let desc_line = out.lines().find(|l| l.starts_with(" d")).expect("desc"); + let desc = desc_line.strip_prefix(" ").unwrap(); + assert_eq!(desc.chars().count(), 100, "description capped at 100"); + assert!(desc.ends_with('…'), "description ellipsized: {desc:?}"); +} + // ---- pure helpers: classify_response -------------------------------------- #[test] @@ -345,11 +377,18 @@ async fn run_formats_results_from_backend_html() { "got: {}", out.stdout ); + // Title is truncated to 15 chars (incl. the ellipsis) for token efficiency. 
assert!( - out.stdout.contains("The Rust Programming Language"), + out.stdout.contains("The Rust Progr…"), "got: {}", out.stdout ); + assert!( + !out.stdout.contains("The Rust Programming Language"), + "title should be truncated: {}", + out.stdout + ); + // URLs are kept intact. assert!( out.stdout.contains("https://www.rust-lang.org/"), "got: {}", @@ -465,11 +504,9 @@ async fn orchestrated_search_completes_under_none() { assert_eq!(result.sandbox_used, SandboxType::None); assert_eq!(result.output.exit_code, 0); + // Title truncated to 15 chars (incl. ellipsis) in the formatted output. assert!( - result - .output - .stdout - .contains("The Rust Programming Language"), + result.output.stdout.contains("The Rust Progr…"), "got: {}", result.output.stdout ); diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 9315e87a..ad9d2da3 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -839,9 +839,10 @@ async fn search_dispatches_to_the_fake_backend() { .expect("search should dispatch"); assert_eq!(out.exit_code, 0); // The fake backend's canned HTML yields one result whose title echoes the - // query, its unwrapped destination URL, and the snippet. + // query (truncated to 15 chars in the output), its unwrapped destination + // URL (kept intact), and the snippet. assert!( - out.stdout.contains("Result for rust lang"), + out.stdout.contains("Result for rus…"), "search stdout: {:?}", out.stdout ); From 84d810901a758bd5c4ece13ebfab7d15bbd156f5 Mon Sep 17 00:00:00 2001 From: reformedot Date: Thu, 4 Jun 2026 18:46:15 -0700 Subject: [PATCH 04/10] Increase search title cap to 30 and description cap to 125 Tune the formatted-output truncation limits: titles 15 -> 30 chars, descriptions 100 -> 125 chars (ellipsis still counted within the cap). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/tools/handlers/search.rs | 4 +-- .../src/tools/handlers/search_tests.rs | 33 ++++++++++--------- .../src/tools/registry_tests.rs | 6 ++-- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/search.rs b/crates/browser-use-agent/src/tools/handlers/search.rs index 8fde09b5..8ab28190 100644 --- a/crates/browser-use-agent/src/tools/handlers/search.rs +++ b/crates/browser-use-agent/src/tools/handlers/search.rs @@ -73,10 +73,10 @@ const SEARCH_REQUEST_TIMEOUT_SECS: u64 = 30; /// Max characters of a result title in the formatted output. Titles are trimmed /// (with an ellipsis counted within the cap) to keep the model-facing text token /// efficient. -const MAX_TITLE_CHARS: usize = 15; +const MAX_TITLE_CHARS: usize = 30; /// Max characters of a result description (snippet) in the formatted output. -const MAX_DESCRIPTION_CHARS: usize = 100; +const MAX_DESCRIPTION_CHARS: usize = 125; /// A single parsed search result. /// diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs index 342b1324..b8d7168d 100644 --- a/crates/browser-use-agent/src/tools/handlers/search_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -275,21 +275,24 @@ fn format_results_renders_header_and_numbered_entries() { #[test] fn format_results_truncates_long_title_and_description() { let results = vec![SearchResult { - title: "ThisTitleIsWayTooLongToKeep".to_string(), + title: "ThisIsAVeryLongResultTitleThatExceedsThirtyCharacters".to_string(), url: "https://example.com/keep/this/whole/url".to_string(), description: "d".repeat(250), }]; let out = format_results("q", &results); - // Title capped at 15 characters including the ellipsis. + // Title capped at 30 characters including the ellipsis. let title = out .lines() .find_map(|l| l.strip_prefix("1. 
")) .expect("title line"); - assert_eq!(title.chars().count(), 15, "title capped at 15: {title:?}"); + assert_eq!(title.chars().count(), 30, "title capped at 30: {title:?}"); assert!(title.ends_with('…'), "title ellipsized: {title:?}"); - assert!(title.starts_with("ThisTitle"), "title prefix: {title:?}"); - assert!(!out.contains("TooLong"), "tail must be dropped: {out}"); + assert!( + title.starts_with("ThisIsAVeryLong"), + "title prefix: {title:?}" + ); + assert!(!out.contains("Characters"), "tail must be dropped: {out}"); // URL is kept intact (not truncated). assert!( @@ -297,10 +300,10 @@ fn format_results_truncates_long_title_and_description() { "url kept: {out}" ); - // Description capped at 100 characters including the ellipsis. + // Description capped at 125 characters including the ellipsis. let desc_line = out.lines().find(|l| l.starts_with(" d")).expect("desc"); let desc = desc_line.strip_prefix(" ").unwrap(); - assert_eq!(desc.chars().count(), 100, "description capped at 100"); + assert_eq!(desc.chars().count(), 125, "description capped at 125"); assert!(desc.ends_with('…'), "description ellipsized: {desc:?}"); } @@ -377,17 +380,12 @@ async fn run_formats_results_from_backend_html() { "got: {}", out.stdout ); - // Title is truncated to 15 chars (incl. the ellipsis) for token efficiency. + // This title (29 chars) is within the 30-char cap, so it appears in full. assert!( - out.stdout.contains("The Rust Progr…"), + out.stdout.contains("The Rust Programming Language"), "got: {}", out.stdout ); - assert!( - !out.stdout.contains("The Rust Programming Language"), - "title should be truncated: {}", - out.stdout - ); // URLs are kept intact. assert!( out.stdout.contains("https://www.rust-lang.org/"), @@ -504,9 +502,12 @@ async fn orchestrated_search_completes_under_none() { assert_eq!(result.sandbox_used, SandboxType::None); assert_eq!(result.output.exit_code, 0); - // Title truncated to 15 chars (incl. ellipsis) in the formatted output. 
+ // Within the 30-char title cap, so it appears in full. assert!( - result.output.stdout.contains("The Rust Progr…"), + result + .output + .stdout + .contains("The Rust Programming Language"), "got: {}", result.output.stdout ); diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index ad9d2da3..751cde5c 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -839,10 +839,10 @@ async fn search_dispatches_to_the_fake_backend() { .expect("search should dispatch"); assert_eq!(out.exit_code, 0); // The fake backend's canned HTML yields one result whose title echoes the - // query (truncated to 15 chars in the output), its unwrapped destination - // URL (kept intact), and the snippet. + // query (within the 30-char cap, so shown in full), its unwrapped + // destination URL (kept intact), and the snippet. assert!( - out.stdout.contains("Result for rus…"), + out.stdout.contains("Result for rust lang"), "search stdout: {:?}", out.stdout ); From af4111cceec8552ff60d8acc2b68fcf93d81c606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Fri, 5 Jun 2026 08:53:47 -0700 Subject: [PATCH 05/10] Tune search tool guidance and scheduling --- .../src/entrypoint/provider.rs | 11 +++++-- .../src/tools/handlers/search.rs | 11 ++++--- .../src/tools/handlers/search_tests.rs | 11 +++++-- .../browser-use-agent/src/tools/registry.rs | 28 +++++++++++------- .../src/tools/registry_tests.rs | 29 +++++++++++++++++-- 5 files changed, 68 insertions(+), 22 deletions(-) diff --git a/crates/browser-use-agent/src/entrypoint/provider.rs b/crates/browser-use-agent/src/entrypoint/provider.rs index d857c9d0..7cb3001d 100644 --- a/crates/browser-use-agent/src/entrypoint/provider.rs +++ b/crates/browser-use-agent/src/entrypoint/provider.rs @@ -1209,7 +1209,7 @@ fn 
build_tool_dispatcher_with_cwd_and_goal_store( use crate::tools::handlers::done::{DoneRequest, DoneTool}; use crate::tools::handlers::mcp::McpToolCallRequest; use crate::tools::handlers::python::{PythonRequest, PythonTool}; - use crate::tools::handlers::search::{SearchRequest, SearchTool}; + use crate::tools::handlers::search::{SearchRequest, SearchTool, SEARCH_PARALLEL_SAFE}; use crate::tools::handlers::shell::{ ExecCommandRequest, ExecCommandTool, ShellRequest, ShellTool, WriteStdinRequest, WriteStdinTool, @@ -1296,8 +1296,13 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( ); // `search`: locally-executed DuckDuckGo (Lite) web search — the client runs // the HTTP request and parses the results itself (distinct from the hosted - // `web_search` above). Read-only, so parallel_safe = true. - reg.register::<_, SearchRequest>("search", definitions::search(), true, SearchTool::new()); + // `web_search` above). Serial to avoid DuckDuckGo Lite rate-limit blocks. + reg.register::<_, SearchRequest>( + "search", + definitions::search(), + SEARCH_PARALLEL_SAFE, + SearchTool::new(), + ); let browser_backend = browser_backend_for_runtime_or_config( config, runtime_handle.as_ref(), diff --git a/crates/browser-use-agent/src/tools/handlers/search.rs b/crates/browser-use-agent/src/tools/handlers/search.rs index 8ab28190..5144f8d7 100644 --- a/crates/browser-use-agent/src/tools/handlers/search.rs +++ b/crates/browser-use-agent/src/tools/handlers/search.rs @@ -54,6 +54,12 @@ use crate::tools::sandbox::{SandboxPermissions, SandboxPreference}; /// The tool name surfaced to the model. pub const SEARCH_TOOL_NAME: &str = "search"; +/// Whether search calls may run concurrently with other parallel-safe tools. +/// +/// Keep DuckDuckGo Lite requests serial: concurrent searches from the same +/// client are more likely to trigger rate limits or challenge pages. +pub const SEARCH_PARALLEL_SAFE: bool = false; + /// The DuckDuckGo Lite search endpoint the real backend fetches. 
const DDG_LITE_BASE_URL: &str = "https://lite.duckduckgo.com/lite/"; @@ -306,10 +312,7 @@ impl Sandboxable for SearchTool { #[async_trait::async_trait] impl ToolRuntime for SearchTool { fn parallel_safe(&self, _req: &SearchRequest) -> bool { - // A read-only HTTP GET + pure parse mutates no shared state, so it is safe - // to run concurrently with other tools — matching the parallel-safe - // stance of `tool_search` / `web_search`. - true + SEARCH_PARALLEL_SAFE } async fn run( diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs index b8d7168d..fea6558f 100644 --- a/crates/browser-use-agent/src/tools/handlers/search_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -8,7 +8,8 @@ use std::sync::Arc; use super::search::{ classify_response, extract_real_url, format_results, normalize_whitespace, parse_lite_results, - SearchBackend, SearchError, SearchRequest, SearchResult, SearchTool, SEARCH_TOOL_NAME, + SearchBackend, SearchError, SearchRequest, SearchResult, SearchTool, SEARCH_PARALLEL_SAFE, + SEARCH_TOOL_NAME, }; use crate::tools::approval::AskForApproval; use crate::tools::orchestrator::{ToolOrchestrator, TurnEnv}; @@ -461,9 +462,13 @@ fn approval_accessors() { } #[test] -fn search_is_parallel_safe() { +fn search_is_serial_to_avoid_rate_limit_blocks() { let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); - assert!(tool.parallel_safe(&SearchRequest::new("rust"))); + assert_eq!( + tool.parallel_safe(&SearchRequest::new("rust")), + SEARCH_PARALLEL_SAFE + ); + assert!(!SEARCH_PARALLEL_SAFE); } #[test] diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 6ffc4eee..ced17ed8 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1162,9 +1162,12 @@ to the single frame that proves the task succeeded." 
pub fn search() -> ToolDefinition { ToolDefinition { name: "search".to_string(), - description: "Search the web using DuckDuckGo and return results directly as text – \ - no browser navigation occurs. The returned results are final and complete. \ - NEVER open a search engine website after calling this action." + description: "Search the web with a local DuckDuckGo Lite request and return compact \ + text results. This does not use or require a browser connection or browser \ + session. Use this instead of navigating a browser to Google, DuckDuckGo, Bing, \ + or any other search engine; it is far more token-efficient than reading a search \ + results page in the browser. Only use the browser after search when you need to \ + inspect a specific result page." .to_string(), input_schema: json!({ "type": "object", @@ -1952,9 +1955,9 @@ Agent-role guidance below only helps choose which agent to use after spawning is /// `WireArgs` types. The browser/python/mcp handlers need an injected backend /// (they would otherwise reach the OS), so those are supplied by the caller. /// -/// `parallel_safe` per tool: `exec_command` / `tool_search` / `web_search` / -/// `search` = `true`; `shell` / `apply_patch` / `view_image` / `browser` / -/// `python` / `update_plan` / `done` = `false` (serial). `mcp` is registered +/// `parallel_safe` per tool: `exec_command` / `tool_search` / `web_search` = +/// `true`; `shell` / `apply_patch` / `view_image` / `browser` / `python` / +/// `search` / `update_plan` / `done` = `false` (serial). `mcp` is registered /// `false` here /// (a serial default); its per-request read-only hint still drives the handler's /// own [`ToolRuntime::parallel_safe`](crate::tools::ToolRuntime::parallel_safe). 
@@ -1981,7 +1984,7 @@ where use crate::tools::handlers::done::DoneRequest; use crate::tools::handlers::mcp::McpToolCallRequest; use crate::tools::handlers::python::PythonRequest; - use crate::tools::handlers::search::SearchRequest; + use crate::tools::handlers::search::{SearchRequest, SEARCH_PARALLEL_SAFE}; use crate::tools::handlers::shell::{ ExecCommandRequest, ExecCommandTool, ShellRequest, WriteStdinRequest, WriteStdinTool, }; @@ -2033,9 +2036,14 @@ where tool_search, ); reg.register::<_, WebSearchRequest>("web_search", definitions::web_search(), true, web_search); - // `search`: locally-executed DuckDuckGo search. Read-only HTTP GET + - // pure parse, so parallel-safe like `web_search` / `tool_search`. - reg.register::<_, SearchRequest>("search", definitions::search(), true, search); + // `search`: locally-executed DuckDuckGo search. Serial to avoid + // DuckDuckGo Lite rate-limit blocks from concurrent requests. + reg.register::<_, SearchRequest>( + "search", + definitions::search(), + SEARCH_PARALLEL_SAFE, + search, + ); // `done`: the completion tool. Serial (terminal; must not be reordered). 
reg.register::<_, DoneRequest>("done", definitions::done(), false, done); diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 751cde5c..a02e287b 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -28,7 +28,9 @@ use crate::tools::handlers::mcp::{ McpCallResult, McpClient, McpTool, McpToolCallRequest, McpWireArgs, }; use crate::tools::handlers::python::{PythonBackend, PythonRequest, PythonTool}; -use crate::tools::handlers::search::{SearchBackend, SearchError, SearchTool}; +use crate::tools::handlers::search::{ + SearchBackend, SearchError, SearchTool, SEARCH_PARALLEL_SAFE, +}; use crate::tools::handlers::shell::{ShellRequest, ShellTool}; use crate::tools::handlers::tool_search::{ToolSearchEntry, ToolSearchRequest, ToolSearchTool}; use crate::tools::handlers::update_plan::{UpdatePlanRequest, UpdatePlanTool}; @@ -576,13 +578,35 @@ fn default_registry_registers_all_tools() { } } +#[test] +fn search_definition_guides_model_away_from_browser_search_engines() { + let desc = definitions::search().description; + assert!( + desc.contains("local DuckDuckGo Lite request"), + "search description should explain the local search backend: {desc}" + ); + assert!( + desc.contains("does not use or require a browser connection or browser session"), + "search description should make clear no browser connection is needed: {desc}" + ); + assert!( + desc.contains("instead of navigating a browser"), + "search description should prefer this tool over browser search-engine navigation: {desc}" + ); + assert!( + desc.contains("token-efficient"), + "search description should call out the token-efficiency reason: {desc}" + ); +} + #[test] fn parallel_safe_flags_match_registration() { let reg = full_registry(); // Pure / read-only tools are parallel-safe. 
assert_eq!(reg.parallel_safe("tool_search"), Some(true)); assert_eq!(reg.parallel_safe("web_search"), Some(true)); - assert_eq!(reg.parallel_safe("search"), Some(true)); + assert_eq!(reg.parallel_safe("search"), Some(SEARCH_PARALLEL_SAFE)); + assert!(!SEARCH_PARALLEL_SAFE); // Everything else is serial. for name in [ "shell", @@ -591,6 +615,7 @@ fn parallel_safe_flags_match_registration() { "browser", "python", "mcp", + "search", "update_plan", "done", ] { From 7199cdc2c6244e9fd69f3681cf97ff0118cdac61 Mon Sep 17 00:00:00 2001 From: reformedot Date: Fri, 5 Jun 2026 18:22:20 -0700 Subject: [PATCH 06/10] Replace DuckDuckGo engine with the browser-use search API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `search` tool now POSTs the query to search.browser-use.com — a thin proxy in front of Parallel's Search API with browser-use auth + billing — instead of scraping DuckDuckGo Lite HTML. Contract verified against the search service source (documents/browser-use/search): - POST {base}/search with {"query"} and the `X-Browser-Use-API-Key` header (key read from BROWSER_USE_API_KEY, the workspace's existing browser-use cloud auth variable; fails fast with an actionable message when unset). - Base URL overridable via BROWSER_USE_SEARCH_URL (e.g. a local dev instance, which runs as an open proxy without auth — keyless requests are allowed through there). - 200 -> {"results":[{title?, url, published_date?, content}]}; the multi-line markdown content is whitespace-normalized; untitled results fall back to their URL; url-less results are dropped; the publication date is appended to the title line when known. - Errors mapped per the service's table: 401 invalid key, 402 insufficient balance, other >=400 carried with a 200-char body snippet — all surfaced to the model as soft errors ("Search failed: ..."). 
All the DuckDuckGo HTML-parsing machinery (regex extraction, entity decoding, redirect unwrapping, percent decoding) is gone; the title/ description truncation (30/125) and output layout are unchanged. Tests rewritten against fixture JSON; live smoke now targets the real service (verified end-to-end against a local instance). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/entrypoint/provider.rs | 13 +- .../src/tools/handlers/mod.rs | 5 +- .../src/tools/handlers/search.rs | 657 ++++++------------ .../src/tools/handlers/search_tests.rs | 392 +++++------ .../browser-use-agent/src/tools/registry.rs | 14 +- .../src/tools/registry_tests.rs | 13 +- 6 files changed, 407 insertions(+), 687 deletions(-) diff --git a/crates/browser-use-agent/src/entrypoint/provider.rs b/crates/browser-use-agent/src/entrypoint/provider.rs index 7cb3001d..5effd73f 100644 --- a/crates/browser-use-agent/src/entrypoint/provider.rs +++ b/crates/browser-use-agent/src/entrypoint/provider.rs @@ -1111,9 +1111,9 @@ fn resolve_provider_with_python( /// The registry registers the backend-free handlers — `shell`, `apply_patch`, /// `view_image`, `update_plan`, `done`, `tool_search` (catalog populated from the registered tools' defs), /// `web_search` (ENABLED; the Responses builder encodes it as the hosted -/// `web_search_preview` tool), `search` (a locally-executed DuckDuckGo search, -/// distinct from the hosted `web_search`) — plus the two product-surface tools -/// that drive real subsystems: +/// `web_search_preview` tool), `search` (a client-executed call to the +/// browser-use search API, distinct from the hosted `web_search`) — plus the +/// two product-surface tools that drive real subsystems: /// * `browser` ([`BrowserTool::new`]): standalone — the production /// [`RealBackend`](crate::tools::handlers::browser::RealBackend) wraps the /// `browser-use-browser` crate and manages CDP sessions internally (keyed by @@ -1294,9 +1294,10 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( 
true, WebSearchTool::new(WebSearchConfig::enabled()), ); - // `search`: locally-executed DuckDuckGo (Lite) web search — the client runs - // the HTTP request and parses the results itself (distinct from the hosted - // `web_search` above). Serial to avoid DuckDuckGo Lite rate-limit blocks. + // `search`: web search via the browser-use search API — the client makes + // the API call (auth: `BROWSER_USE_API_KEY`) and formats the results itself + // (distinct from the hosted `web_search` above). Serial: a conservative + // scheduling default for a billed API call. reg.register::<_, SearchRequest>( "search", definitions::search(), diff --git a/crates/browser-use-agent/src/tools/handlers/mod.rs b/crates/browser-use-agent/src/tools/handlers/mod.rs index cfae823e..b6ac0eff 100644 --- a/crates/browser-use-agent/src/tools/handlers/mod.rs +++ b/crates/browser-use-agent/src/tools/handlers/mod.rs @@ -56,9 +56,8 @@ pub use mcp::{ }; pub use python::{PythonApprovalKey, PythonBackend, PythonRequest, PythonTool}; pub use search::{ - classify_response, extract_real_url, format_results, normalize_whitespace, parse_lite_results, - HttpSearchBackend, SearchApprovalKey, SearchBackend, SearchError, SearchRequest, SearchResult, - SearchTool, + classify_response, format_results, normalize_whitespace, parse_results, HttpSearchBackend, + SearchApprovalKey, SearchBackend, SearchError, SearchRequest, SearchResult, SearchTool, }; pub use shell::{ ExecCommandApprovalKey, ExecCommandRequest, ExecCommandTool, ShellApprovalKey, ShellRequest, diff --git a/crates/browser-use-agent/src/tools/handlers/search.rs b/crates/browser-use-agent/src/tools/handlers/search.rs index 5144f8d7..0e226a7b 100644 --- a/crates/browser-use-agent/src/tools/handlers/search.rs +++ b/crates/browser-use-agent/src/tools/handlers/search.rs @@ -1,50 +1,50 @@ -//! `search` tool: a LOCALLY-executed DuckDuckGo (Lite) web search. +//! `search` tool: a web search via the browser-use search API. //! -//! 
This is the async re-implementation of the legacy Python `search` action -//! (a `browser_use` `Controller` action that fetched -//! `lite.duckduckgo.com/lite/` over HTTP and parsed the result HTML). Only the -//! *search logic* is ported — the surrounding `Controller` / DB / session -//! scaffolding (and the unrelated `request_human_control` action) are dropped. -//! Like the other handlers it implements the full trait stack +//! The client POSTs the query to `search.browser-use.com` — a thin proxy in +//! front of [Parallel](https://parallel.ai)'s Search API with browser-use auth +//! and billing — and formats the returned JSON results for the model. This +//! replaced the DuckDuckGo Lite scrape the tool was originally ported from: +//! the engine changed, the tool surface (name, request shape, output layout) +//! did not. Like the other handlers it implements the full trait stack //! ([`Approvable`] + [`Sandboxable`] + [`ToolRuntime`]) so it can be driven by -//! the [`ToolOrchestrator`](crate::tools::orchestrator::ToolOrchestrator), -//! mirroring the `tool_search` tool's structure: a non-FS, -//! fetch-parse-and-return tool that spawns no process. +//! the [`ToolOrchestrator`](crate::tools::orchestrator::ToolOrchestrator). //! //! # Relationship to [`web_search`](super::web_search) //! //! [`web_search`](super::web_search) is the HOSTED, provider-executed web search -//! (the provider runs the search server-side; the client only declares + passes -//! through the result — it performs *no* local HTTP). This `search` tool is the -//! opposite: it performs a REAL local HTTP GET against DuckDuckGo Lite and parses -//! the returned HTML itself, exactly as the Python action did. The two are -//! complementary, not duplicates: `web_search` needs a capable provider; `search` -//! works against any provider because the client does the work. +//! (the model provider runs the search server-side; the client only declares + +//! 
passes through the result — it performs *no* local HTTP). This `search` tool +//! is the opposite: the client performs the API call itself, so it works +//! against any model provider. +//! +//! # API contract (verified against the `search` service source) +//! +//! * `POST {base}/search` with JSON `{"query": "…"}` and the +//! [`X-Browser-Use-API-Key`](SEARCH_API_KEY_HEADER) header (a `bu_…` key, +//! read from [`BROWSER_USE_API_KEY`](SEARCH_API_KEY_ENV) — the same variable +//! the rest of the workspace uses for browser-use cloud auth). The base URL +//! defaults to the production service and can be overridden via +//! [`BROWSER_USE_SEARCH_URL`](SEARCH_BASE_URL_ENV) (e.g. a local dev +//! instance, which runs as an open proxy without auth). +//! * `200` → `{"results": [{"title"?, "url", "published_date"?, "content"}]}`; +//! `title` / `published_date` are omitted when the source lacks them, and +//! `content` is multi-line markdown (whitespace-normalized here). +//! * Errors: `400` invalid query, `401` missing/invalid API key, `402` +//! insufficient balance, `422` upstream rejected the request, `502` upstream +//! failed, `503` auth/billing backend unavailable. //! //! # Network seam (testability) //! -//! The HTTP fetch lives behind the [`SearchBackend`] trait, with the real +//! The HTTP call lives behind the [`SearchBackend`] trait, with the real //! [`HttpSearchBackend`] (a `reqwest` client) injected by default and a fake //! substitutable in tests. This mirrors how the `browser` / `python` / `mcp` -//! handlers inject their backends (`BrowserTool::with_backend`, -//! `McpTool::new(Arc)`), so the tool's parsing/formatting logic is -//! unit-tested deterministically with fixture HTML — no network is touched. -//! -//! # HTML parsing -//! -//! The Python original used BeautifulSoup. This crate intentionally carries no -//! HTML-parser dependency (the existing browser tooling reads the DOM from a real -//! 
browser over CDP, never by parsing HTML strings), so to keep the dependency -//! footprint unchanged we extract the few fields we need with targeted `regex` -//! over the *specific, stable* DuckDuckGo Lite markup — the same fixed selectors -//! BeautifulSoup keyed on (`a.result-link`, `td.result-snippet`). The extraction -//! is faithful to the Python logic and fully fixture-tested in `search_tests.rs`. +//! handlers inject their backends, so the tool's parsing/formatting logic is +//! unit-tested deterministically with fixture JSON — no network is touched. use std::sync::{Arc, OnceLock}; use std::time::Duration; use regex::Regex; -use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, USER_AGENT}; use crate::tools::runtime::{ Approvable, ExecOutput, SandboxAttempt, Sandboxable, ToolCtx, ToolError, ToolRuntime, @@ -56,25 +56,31 @@ pub const SEARCH_TOOL_NAME: &str = "search"; /// Whether search calls may run concurrently with other parallel-safe tools. /// -/// Keep DuckDuckGo Lite requests serial: concurrent searches from the same -/// client are more likely to trigger rate limits or challenge pages. +/// Kept serial: a conservative scheduling default for a billed API call +/// (carried over from the previous engine's rate-limit concerns). pub const SEARCH_PARALLEL_SAFE: bool = false; -/// The DuckDuckGo Lite search endpoint the real backend fetches. -const DDG_LITE_BASE_URL: &str = "https://lite.duckduckgo.com/lite/"; +/// The browser-use search service base URL. +const SEARCH_BASE_URL: &str = "https://search.browser-use.com"; -/// Browser-like `User-Agent` (ported verbatim from the Python action's headers). -const DDG_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \ -AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36"; +/// Environment variable overriding the search service base URL (e.g. a local +/// dev instance, `http://localhost:8080`, which runs as an open proxy without +/// auth). Defaults to [`SEARCH_BASE_URL`]. 
+const SEARCH_BASE_URL_ENV: &str = "BROWSER_USE_SEARCH_URL"; -/// `Accept` header (ported verbatim from the Python action's headers). -const DDG_ACCEPT: &str = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; +/// Environment variable holding the `bu_…` browser-use API key. The same +/// variable the rest of the workspace uses for browser-use cloud auth +/// (`.env.example`, `browser-use-browser`). +const SEARCH_API_KEY_ENV: &str = "BROWSER_USE_API_KEY"; -/// `Accept-Language` header (ported verbatim from the Python action's headers). -const DDG_ACCEPT_LANGUAGE: &str = "en-US,en;q=0.9"; +/// Auth header the search service expects (service `internal/api/server.go` / +/// its README: `X-Browser-Use-API-Key: bu_…`). +const SEARCH_API_KEY_HEADER: &str = "X-Browser-Use-API-Key"; -/// Request timeout (the Python action used `timeout=30.0`). -const SEARCH_REQUEST_TIMEOUT_SECS: u64 = 30; +/// Client-side request timeout. The service's own upstream (Parallel) timeout +/// is 30s (`UPSTREAM_TIMEOUT`); 60s gives it room to answer — including with a +/// `502` — before we cut the connection. +const SEARCH_REQUEST_TIMEOUT_SECS: u64 = 60; /// Max characters of a result title in the formatted output. Titles are trimmed /// (with an ellipsis counted within the cap) to keep the model-facing text token @@ -86,20 +92,22 @@ const MAX_DESCRIPTION_CHARS: usize = 125; /// A single parsed search result. /// -/// Mirrors the Python action's `{title, url, description}` dict. +/// Mirrors the service's result object; the wire `content` (multi-line +/// markdown) is whitespace-normalized into the single-line `description`. #[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct SearchResult { - /// The result's title (the `a.result-link` text). + /// The result's title; empty when the source provided none. pub title: String, - /// The result's destination URL (the DuckDuckGo redirect, unwrapped). + /// The result's destination URL. 
pub url: String, - /// The result's snippet (the following `td.result-snippet` text), if any. + /// `YYYY-MM-DD` publication date, when the source provides one. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub published_date: Option<String>, + /// The result's content/snippet, normalized to a single line. pub description: String, } /// Typed request for the `search` tool. -/// -/// Mirrors the Python `SearchParams { query }`. #[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct SearchRequest { /// The search query to look up on the web. @@ -115,60 +123,79 @@ impl SearchRequest { } } -/// An error from the search backend's HTTP fetch. +/// An error from the search backend's HTTP call. /// -/// Reproduces the failure cases the Python `_search_duckduckgo` raised: a -/// challenge/CAPTCHA page, a non-2xx HTTP status, and a transport error. +/// The named variants mirror the service's documented statuses so the model +/// sees an actionable message instead of a bare code. #[derive(Debug, thiserror::Error)] pub enum SearchError { - /// DuckDuckGo returned a challenge/anti-bot page (HTTP 202, or the body - /// mentions "anomaly"). - #[error( - "DuckDuckGo is showing a challenge/CAPTCHA – too many requests or suspicious activity." - )] - Challenge, - /// The server returned a client/server error status. + /// No API key was configured; the request was not attempted. + #[error("BROWSER_USE_API_KEY is not set – the browser-use search API requires an API key")] + MissingApiKey, + /// The service rejected the API key (HTTP 401). + #[error("invalid or missing browser-use API key (HTTP 401)")] + Unauthorized, + /// The project balance is exhausted (HTTP 402). + #[error("insufficient browser-use balance (HTTP 402)")] + InsufficientBalance, + /// Any other client/server error status (400, 422, 502, 503, …). #[error("HTTP {status}: {snippet}")] Http { /// The HTTP status code. 
status: u16, - /// The first 200 chars of the response body (matching the Python - /// `response.text[:200]`). + /// The first 200 chars of the response body. snippet: String, }, + /// A `200` response whose body was not the documented JSON shape. + #[error("unexpected response body: {0}")] + Decode(String), /// A transport-level error (connection, timeout, decoding). #[error("{0}")] Request(String), } -/// The network seam: fetch the raw DuckDuckGo Lite HTML for a query. +/// The network seam: fetch the raw search-API response body for a query. /// /// Implemented for real by [`HttpSearchBackend`] and by a fake in tests, so the /// tool's parsing/formatting can be exercised without a real network — mirroring /// the `browser` / `python` / `mcp` backend seams. #[async_trait::async_trait] pub trait SearchBackend: Send + Sync { - /// Fetch the DuckDuckGo Lite result HTML for `query`. + /// Fetch the search service's JSON response body for `query`. async fn fetch(&self, query: &str) -> Result<String, SearchError>; } -/// The real [`SearchBackend`]: a `reqwest` client against DuckDuckGo Lite. +/// The real [`SearchBackend`]: a `reqwest` client against the browser-use +/// search service. pub struct HttpSearchBackend { client: reqwest::Client, base_url: String, + api_key: Option<String>, } impl HttpSearchBackend { - /// Construct the backend with a default client and the DuckDuckGo Lite - /// endpoint. + /// Construct the backend from the environment: the base URL from + /// [`BROWSER_USE_SEARCH_URL`](SEARCH_BASE_URL_ENV) (defaulting to the + /// production [`SEARCH_BASE_URL`]) and the API key from + /// [`BROWSER_USE_API_KEY`](SEARCH_API_KEY_ENV). 
pub fn new() -> Self { + let base_url = std::env::var(SEARCH_BASE_URL_ENV) + .ok() + .map(|url| url.trim().trim_end_matches('/').to_string()) + .filter(|url| !url.is_empty()) + .unwrap_or_else(|| SEARCH_BASE_URL.to_string()); + let api_key = std::env::var(SEARCH_API_KEY_ENV) + .ok() + .map(|key| key.trim().to_string()) + .filter(|key| !key.is_empty()); let client = reqwest::Client::builder() .timeout(Duration::from_secs(SEARCH_REQUEST_TIMEOUT_SECS)) .build() .unwrap_or_else(|_| reqwest::Client::new()); Self { client, - base_url: DDG_LITE_BASE_URL.to_string(), + base_url, + api_key, } } } @@ -182,18 +209,22 @@ impl Default for HttpSearchBackend { #[async_trait::async_trait] impl SearchBackend for HttpSearchBackend { async fn fetch(&self, query: &str) -> Result<String, SearchError> { - // `reqwest`'s `.query()` produces application/x-www-form-urlencoded - // output (space -> `+`); the encoded byte set differs from Python's - // `quote_plus` on a few characters (e.g. `~`, `*`), but DuckDuckGo - // decodes both to the same query, so results are equivalent. Redirects - // are followed by default, matching `follow_redirects=True`. - let response = self + // The production service always requires a key: fail fast with an + // actionable message instead of a guaranteed 401 round-trip. A custom + // endpoint (BROWSER_USE_SEARCH_URL, e.g. a local dev instance) may be + // an open proxy, so keyless requests are allowed through there. 
+ if self.api_key.is_none() && self.base_url == SEARCH_BASE_URL { + return Err(SearchError::MissingApiKey); + } + + let mut request = self .client - .get(&self.base_url) - .query(&[("q", query)]) - .header(USER_AGENT, DDG_USER_AGENT) - .header(ACCEPT, DDG_ACCEPT) - .header(ACCEPT_LANGUAGE, DDG_ACCEPT_LANGUAGE) + .post(format!("{}/search", self.base_url)) + .json(&serde_json::json!({ "query": query })); + if let Some(api_key) = self.api_key.as_deref() { + request = request.header(SEARCH_API_KEY_HEADER, api_key); + } + let response = request .send() .await .map_err(|err| SearchError::Request(err.to_string()))?; @@ -209,18 +240,20 @@ impl SearchBackend for HttpSearchBackend { } } -/// Classify an HTTP response the way the Python action did: a challenge page -/// (status 202 or an "anomaly" body) first, then any `>= 400` status as an -/// error, otherwise success. +/// Classify an HTTP response per the service's documented statuses: `401` and +/// `402` get named, actionable errors; any other `>= 400` (400 invalid query, +/// 422 upstream rejected, 502 upstream failed, 503 auth backend down) carries +/// the status plus the first 200 chars of the body; everything else is success. pub fn classify_response(status: u16, body: &str) -> Result<(), SearchError> { - if status == 202 || body.to_ascii_lowercase().contains("anomaly") { - return Err(SearchError::Challenge); - } - if status >= 400 { - let snippet: String = body.chars().take(200).collect(); - return Err(SearchError::Http { status, snippet }); + match status { + 401 => Err(SearchError::Unauthorized), + 402 => Err(SearchError::InsufficientBalance), + s if s >= 400 => { + let snippet: String = body.chars().take(200).collect(); + Err(SearchError::Http { status: s, snippet }) + } + _ => Ok(()), } - Ok(()) } /// The async `search` tool. 
@@ -287,8 +320,8 @@ impl Approvable for SearchTool { } // `exec_approval_requirement` is intentionally left at its trait default - // (`None`): the search is a benign, read-only HTTP GET (the Python action had - // no approval gate either). Returning `None` lets the orchestrator apply + // (`None`): the search is a benign, read-only query against the browser-use + // search API. Returning `None` lets the orchestrator apply // `default_exec_approval_requirement`, which yields `Skip` under any // non-prompting policy. The outbound request mirrors the crate's existing // network usage (the MCP HTTP client, analytics) which is likewise ungated. @@ -332,13 +365,16 @@ impl ToolRuntime for SearchTool { )); } - // A fetch failure is surfaced to the model as a soft error (nonzero exit - // with the message on stderr), mirroring the Python action's - // `ActionResult(error="Search failed: …")` and the MCP handler's - // model-facing error mapping — not a hard tool error. - match self.backend.fetch(query).await { - Ok(html) => { - let results = parse_lite_results(&html); + // A fetch/parse failure is surfaced to the model as a soft error + // (nonzero exit with the message on stderr), mirroring the MCP + // handler's model-facing error mapping — not a hard tool error. + match self + .backend + .fetch(query) + .await + .and_then(|body| parse_results(&body)) + { + Ok(results) => { let stdout = if results.is_empty() { format!("No results found for \"{query}\".") } else { @@ -360,17 +396,65 @@ impl ToolRuntime for SearchTool { } // --------------------------------------------------------------------------- -// Pure helpers (parsing + formatting) — ported from the Python action. +// Pure helpers (parsing + formatting). // --------------------------------------------------------------------------- +/// Wire shape of the service's `200` response: `{"results": [...]}`. 
+#[derive(serde::Deserialize)] +struct SearchResponseWire { + #[serde(default)] + results: Vec<SearchResultWire>, +} + +/// Wire shape of one result. `title` / `published_date` are omitted when the +/// source lacks them; everything defaults so one sparse result cannot fail the +/// whole response. +#[derive(serde::Deserialize)] +struct SearchResultWire { + #[serde(default)] + title: String, + #[serde(default)] + url: String, + #[serde(default)] + published_date: Option<String>, + #[serde(default)] + content: String, +} + +/// Parse the search service's JSON response body into results. +/// +/// The wire `content` arrives as multi-line markdown; it is whitespace- +/// normalized into the single-line `description`. Results without a `url` are +/// dropped (the model cannot follow them). A body that is not the documented +/// JSON shape is a [`SearchError::Decode`]. +pub fn parse_results(body: &str) -> Result<Vec<SearchResult>, SearchError> { + let wire: SearchResponseWire = + serde_json::from_str(body).map_err(|err| SearchError::Decode(err.to_string()))?; + + Ok(wire + .results + .into_iter() + .filter(|result| !result.url.trim().is_empty()) + .map(|result| SearchResult { + title: normalize_whitespace(&result.title), + url: result.url.trim().to_string(), + published_date: result + .published_date + .map(|date| date.trim().to_string()) + .filter(|date| !date.is_empty()), + description: normalize_whitespace(&result.content), + }) + .collect()) +} + /// Format parsed results into the readable text block the model sees. /// -/// Faithful to the Python action's `extracted_content` layout: a header (count + -/// the "you already have the results" guidance), then a numbered list with each -/// result's title, `URL:` line, and optional snippet, blank-line separated. The -/// title and description are truncated ([`MAX_TITLE_CHARS`] / -/// [`MAX_DESCRIPTION_CHARS`]) for token efficiency; URLs are kept intact so they -/// remain usable. 
+/// A header (count + the "you already have the results" guidance), then a +/// numbered list with each result's title (publication date appended when +/// known), `URL:` line, and optional snippet, blank-line separated. The title +/// and description are truncated ([`MAX_TITLE_CHARS`] / +/// [`MAX_DESCRIPTION_CHARS`]) for token efficiency; URLs are kept intact so +/// they remain usable. pub fn format_results(query: &str, results: &[SearchResult]) -> String { let mut lines: Vec<String> = Vec::with_capacity(results.len() * 4 + 1); lines.push(format!( @@ -380,11 +464,17 @@ pub fn format_results(query: &str, results: &[SearchResult]) -> String { results.len() )); for (i, result) in results.iter().enumerate() { - lines.push(format!( - "{}. {}", - i + 1, - truncate_chars(&result.title, MAX_TITLE_CHARS) - )); + // Fall back to the URL when the source provided no title. + let title = if result.title.is_empty() { + result.url.as_str() + } else { + result.title.as_str() + }; + let mut title_line = format!("{}. {}", i + 1, truncate_chars(title, MAX_TITLE_CHARS)); + if let Some(date) = result.published_date.as_deref() { + title_line.push_str(&format!(" ({date})")); + } + lines.push(title_line); lines.push(format!(" URL: {}", result.url)); if !result.description.is_empty() { lines.push(format!( @@ -410,361 +500,14 @@ fn truncate_chars(text: &str, max: usize) -> String { format!("{}…", prefix.trim_end()) } -/// Unwrap a DuckDuckGo redirect URL to its real destination. -/// -/// Ported from the Python `_extract_real_url`: -/// * protocol-relative `//host/…` gets an `https:` scheme; -/// * a `duckduckgo.com/l/?uddg=…` redirect is unwrapped to its `uddg` target -/// (form-decoded, matching `parse_qs` + `unquote`); -/// * ad links (`duckduckgo.com/y.js`) and non-`http(s)` schemes are dropped -/// (returns `None`). 
-pub fn extract_real_url(ddg_url: &str) -> Option { - if ddg_url.is_empty() { - return None; - } - - let with_scheme = if let Some(rest) = ddg_url.strip_prefix("//") { - format!("https://{rest}") - } else { - ddg_url.to_string() - }; - - let mut url = with_scheme.clone(); - if with_scheme.contains("duckduckgo.com/l/") && with_scheme.contains("uddg=") { - if let Some(target) = query_param(&with_scheme, "uddg") { - url = target; - } - } - - // Ad links – skip. - if url.contains("duckduckgo.com/y.js") { - return None; - } - - // Only allow http/https to prevent unsafe URLs (javascript:, data:, …). - if !(url.starts_with("https://") || url.starts_with("http://")) { - return None; - } - - Some(url) -} - /// Collapse runs of whitespace into a single space and trim the ends. -/// -/// Ported from the Python `_normalize_whitespace` -/// (`re.sub(r"\s+", " ", text).strip()`). pub fn normalize_whitespace(text: &str) -> String { whitespace_regex() .replace_all(text.trim(), " ") .into_owned() } -/// Parse search results out of a DuckDuckGo Lite HTML response. -/// -/// Ported from the Python `_parse_lite_results`: for each `a.result-link`, take -/// its (entity-decoded) text as the title and unwrap its `href`; skip empty / -/// "more info" / duplicate / `duckduckgo.com` results; and attach the snippet -/// from the first following `td.result-snippet` that precedes the next result -/// link. 
-pub fn parse_lite_results(html: &str) -> Vec { - let anchors = collect_anchors(html); - let snippets = collect_snippets(html); - - let mut results: Vec = Vec::new(); - let mut seen: std::collections::HashSet = std::collections::HashSet::new(); - - for (idx, anchor) in anchors.iter().enumerate() { - if anchor.title.is_empty() || anchor.title.eq_ignore_ascii_case("more info") { - continue; - } - - let Some(url) = extract_real_url(&anchor.href) else { - continue; - }; - if seen.contains(&url) || url.contains("duckduckgo.com") { - continue; - } - seen.insert(url.clone()); - - // The snippet is the first `result-snippet` after this anchor and before - // the next one (matching the Python sibling-walk that stops at the next - // result link). - let next_pos = anchors.get(idx + 1).map_or(usize::MAX, |a| a.pos); - let description = snippets - .iter() - .find(|s| s.pos > anchor.pos && s.pos < next_pos) - .map(|s| s.text.clone()) - .unwrap_or_default(); - - results.push(SearchResult { - title: anchor.title.clone(), - url, - description, - }); - } - - results -} - -/// A raw `a.result-link` extracted from the HTML, with its byte offset. -struct RawAnchor { - pos: usize, - href: String, - title: String, -} - -/// A raw `td.result-snippet` extracted from the HTML, with its byte offset. -struct RawSnippet { - pos: usize, - text: String, -} - -/// Extract every `a.result-link` anchor (offset, href, title) in document order. -fn collect_anchors(html: &str) -> Vec { - anchor_regex() - .captures_iter(html) - .filter_map(|caps| { - let whole = caps.get(0)?; - let attrs = caps.get(1).map_or("", |m| m.as_str()); - let inner = caps.get(2).map_or("", |m| m.as_str()); - if !has_class(attrs, "result-link") { - return None; - } - Some(RawAnchor { - pos: whole.start(), - href: attr_value(attrs, AttrName::Href).unwrap_or_default(), - // Strip tags, decode entities, then trim. 
DuckDuckGo Lite titles - // are plain text, so this matches the Python `get_text(strip=True)` - // title extraction; on any inline markup it yields the cleaner - // space-preserving text rather than BeautifulSoup's node-join. - title: text_from_html(inner, "").trim().to_string(), - }) - }) - .collect() -} - -/// Extract every `td.result-snippet` (offset, normalized text) in document order. -fn collect_snippets(html: &str) -> Vec { - td_regex() - .captures_iter(html) - .filter_map(|caps| { - let whole = caps.get(0)?; - let attrs = caps.get(1).map_or("", |m| m.as_str()); - let inner = caps.get(2).map_or("", |m| m.as_str()); - if !has_class(attrs, "result-snippet") { - return None; - } - Some(RawSnippet { - pos: whole.start(), - // `get_text(separator=" ")` then normalize whitespace. - text: normalize_whitespace(&text_from_html(inner, " ")), - }) - }) - .collect() -} - -/// Strip HTML tags (replacing each with `separator`) and decode entities. -fn text_from_html(html: &str, separator: &str) -> String { - let without_tags = tag_regex().replace_all(html, separator); - decode_entities(&without_tags) -} - -/// Whether a tag's attribute string declares `class` containing `class_name`. -fn has_class(attrs: &str, class_name: &str) -> bool { - attr_value(attrs, AttrName::Class) - .is_some_and(|value| value.split_whitespace().any(|c| c == class_name)) -} - -/// The attributes we extract from a tag. -#[derive(Clone, Copy)] -enum AttrName { - Href, - Class, -} - -/// Extract a quoted attribute value from a tag's attribute string. -fn attr_value(attrs: &str, name: AttrName) -> Option { - let re = match name { - AttrName::Href => href_regex(), - AttrName::Class => class_regex(), - }; - re.captures(attrs) - .and_then(|caps| caps.get(1)) - .map(|m| m.as_str().to_string()) -} - -/// Read a single query parameter's value, form-decoded (matching `parse_qs`: -/// `+` becomes a space and `%XX` is percent-decoded). 
-fn query_param(url: &str, key: &str) -> Option<String> { - let (_, query) = url.split_once('?')?; - // Drop any fragment before splitting pairs. - let query = query.split('#').next().unwrap_or(query); - for pair in query.split('&') { - let (k, v) = pair.split_once('=').unwrap_or((pair, "")); - if k == key { - return Some(percent_decode_form(v)); - } - } - None -} - -/// Form-decode a query component: `+` -> space, `%XX` -> byte, then UTF-8. -fn percent_decode_form(value: &str) -> String { - let spaced = value.replace('+', " "); - let bytes = spaced.as_bytes(); - let mut out: Vec<u8> = Vec::with_capacity(bytes.len()); - let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'%' && i + 2 < bytes.len() { - if let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2])) { - out.push(hi * 16 + lo); - i += 3; - continue; - } - } - out.push(bytes[i]); - i += 1; - } - String::from_utf8_lossy(&out).into_owned() -} - -/// Hex digit value of an ASCII byte, or `None`. -fn hex_val(byte: u8) -> Option<u8> { - match byte { - b'0'..=b'9' => Some(byte - b'0'), - b'a'..=b'f' => Some(byte - b'a' + 10), - b'A'..=b'F' => Some(byte - b'A' + 10), - _ => None, - } -} - -/// Decode the common HTML character references in one pass. -/// -/// Covers the named references that appear in DuckDuckGo snippets plus all -/// numeric references (`&#NN;` / `&#xHH;`); unknown named references are left -/// intact (BeautifulSoup decodes the full set — this is the practical subset). 
-fn decode_entities(text: &str) -> String { - entity_regex() - .replace_all(text, |caps: &regex::Captures<'_>| { - let body = &caps[1]; - if let Some(hex) = body.strip_prefix("#x").or_else(|| body.strip_prefix("#X")) { - return decode_codepoint(u32::from_str_radix(hex, 16).ok()) - .unwrap_or_else(|| caps[0].to_string()); - } - if let Some(dec) = body.strip_prefix('#') { - return decode_codepoint(dec.parse::<u32>().ok()) - .unwrap_or_else(|| caps[0].to_string()); - } - match body { - "amp" => "&", - "lt" => "<", - "gt" => ">", - "quot" => "\"", - "apos" => "'", - "nbsp" => " ", - // Typographic punctuation. - "hellip" => "…", - "mdash" => "—", - "ndash" => "–", - "rsquo" => "\u{2019}", - "lsquo" => "\u{2018}", - "rdquo" => "\u{201D}", - "ldquo" => "\u{201C}", - "laquo" => "«", - "raquo" => "»", - "middot" => "·", - "bull" => "•", - // Common symbols. - "copy" => "©", - "reg" => "®", - "trade" => "™", - "times" => "×", - "divide" => "÷", - "deg" => "°", - "euro" => "€", - "pound" => "£", - "cent" => "¢", - "sect" => "§", - // Common Western-European accented letters. - "aacute" => "á", - "agrave" => "à", - "acirc" => "â", - "auml" => "ä", - "aring" => "å", - "ccedil" => "ç", - "eacute" => "é", - "egrave" => "è", - "ecirc" => "ê", - "euml" => "ë", - "iacute" => "í", - "iuml" => "ï", - "ntilde" => "ñ", - "oacute" => "ó", - "ocirc" => "ô", - "ouml" => "ö", - "uacute" => "ú", - "uuml" => "ü", - "szlig" => "ß", - // Unknown named reference: leave the original text intact - // (BeautifulSoup decodes the full HTML5 set; this is the - // practical subset DuckDuckGo emits, plus all numeric refs). - _ => return caps[0].to_string(), - } - .to_string() - }) - .into_owned() -} - -/// Map a numeric character-reference code point to its string, if valid. 
-fn decode_codepoint(code: Option) -> Option { - code.and_then(char::from_u32).map(|c| c.to_string()) -} - -// --- Cached regexes (compiled once; patterns are constant) ----------------- -// -// The tag regexes use `[^>]*` for the attribute span, which assumes attribute -// values contain no literal `>` — true for the fixed DuckDuckGo Lite markup -// (see the module doc). On non-conforming markup a `>` inside an attribute -// value would truncate the match (dropping that result), never panic. - -fn anchor_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| Regex::new(r"(?is)]*)>(.*?)").expect("valid anchor regex")) -} - -fn td_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| Regex::new(r"(?is)]*)>(.*?)").expect("valid td regex")) -} - -fn tag_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| Regex::new(r"(?s)<[^>]*>").expect("valid tag regex")) -} - -fn href_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| { - Regex::new(r#"(?i)(?:^|\s)href\s*=\s*["']([^"']*)["']"#).expect("valid href regex") - }) -} - -fn class_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| { - Regex::new(r#"(?i)(?:^|\s)class\s*=\s*["']([^"']*)["']"#).expect("valid class regex") - }) -} - fn whitespace_regex() -> &'static Regex { static RE: OnceLock = OnceLock::new(); RE.get_or_init(|| Regex::new(r"\s+").expect("valid whitespace regex")) } - -fn entity_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| { - Regex::new(r"&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);") - .expect("valid entity regex") - }) -} diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs index fea6558f..8abd03c9 100644 --- a/crates/browser-use-agent/src/tools/handlers/search_tests.rs +++ 
b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -1,15 +1,14 @@ //! Tests for the async `search` tool ([`SearchTool`]). //! -//! No real network is touched: the pure parsing/formatting/URL helpers are -//! exercised against fixture HTML, and the `run` path is driven through a fake +//! No real network is touched: the pure parsing/formatting helpers are +//! exercised against fixture JSON, and the `run` path is driven through a fake //! [`SearchBackend`] (mirroring `update_plan_tests` / `tool_search_tests`). use std::sync::Arc; use super::search::{ - classify_response, extract_real_url, format_results, normalize_whitespace, parse_lite_results, - SearchBackend, SearchError, SearchRequest, SearchResult, SearchTool, SEARCH_PARALLEL_SAFE, - SEARCH_TOOL_NAME, + classify_response, format_results, normalize_whitespace, parse_results, SearchBackend, + SearchError, SearchRequest, SearchResult, SearchTool, SEARCH_PARALLEL_SAFE, SEARCH_TOOL_NAME, }; use crate::tools::approval::AskForApproval; use crate::tools::orchestrator::{ToolOrchestrator, TurnEnv}; @@ -60,60 +59,54 @@ fn turn_env() -> TurnEnv { } } -/// A fake backend returning a canned HTML body (no network). -struct HtmlBackend(String); +/// A fake backend returning a canned response body (no network). +struct StubBackend(String); #[async_trait::async_trait] -impl SearchBackend for HtmlBackend { +impl SearchBackend for StubBackend { async fn fetch(&self, _query: &str) -> Result { Ok(self.0.clone()) } } -/// A fake backend returning a challenge error (no network). -struct ChallengeBackend; +/// A fake backend failing with a 401 (no network). 
+struct UnauthorizedBackend; #[async_trait::async_trait] -impl SearchBackend for ChallengeBackend { +impl SearchBackend for UnauthorizedBackend { async fn fetch(&self, _query: &str) -> Result { - Err(SearchError::Challenge) + Err(SearchError::Unauthorized) } } -/// A small, realistic DuckDuckGo Lite results fixture exercising: a redirect -/// URL, an entity in the snippet, a "More info" link (skipped), a duplicate -/// (deduped), a `duckduckgo.com` target (skipped), a direct (non-redirect) link, -/// and a result without a snippet. -const FIXTURE: &str = r#" - - - - - - - - - - - - - - - - - - - - - -
1. The Rust Programming Language
 A language empowering everyone to build reliable & efficient software — fast.
www.rust-lang.org
2. Rust (duplicate target)
duplicate should be dropped
DuckDuckGo About
a duckduckgo.com target, should be dropped
More info
Direct Link No Redirect
direct link snippet
No Snippet Result
- -"#; +/// A realistic search-service response fixture exercising: a full result +/// (title + date + multi-line markdown content), a result without a +/// `published_date`, a result without a `title` (URL fallback), and a result +/// without a `url` (dropped). +const FIXTURE: &str = r##"{ + "results": [ + { + "title": "Genpact and Parallel Web Systems Partner to Drive Tangible Efficiency from AI Systems", + "url": "https://www.prnewswire.com/news-releases/genpact-parallel-302736563.html", + "published_date": "2026-04-08", + "content": "# Genpact and Parallel\n## Share this article\nIntegrating Parallel's API helps\nGenpact automate research workflows." + }, + { + "title": "Parallel raises $100M", + "url": "https://www.linkedin.com/posts/example-activity", + "content": "Nov 12, 2025 · The startup secured a $100 million Series A round." + }, + { + "url": "https://untitled.example.com/page", + "published_date": "2026-05-19", + "content": "A result whose source provided no title." + }, + { + "title": "No URL – must be dropped", + "content": "this result has no url and is filtered out" + } + ] +}"##; // ---- pure helpers: normalize_whitespace ----------------------------------- @@ -124,120 +117,56 @@ fn normalize_whitespace_collapses_and_trims() { assert_eq!(normalize_whitespace(" "), ""); } -// ---- pure helpers: extract_real_url --------------------------------------- +// ---- pure helpers: parse_results ------------------------------------------- #[test] -fn extract_real_url_unwraps_ddg_redirect() { - let raw = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage%3Fx%3D1&rut=abc"; - assert_eq!( - extract_real_url(raw), - Some("https://example.com/page?x=1".to_string()) - ); -} +fn parse_results_maps_wire_results() { + let results = parse_results(FIXTURE).unwrap(); -#[test] -fn extract_real_url_decodes_plus_as_space() { - // `parse_qs` semantics: `+` in a query value decodes to a space. 
- let raw = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fa+b"; - assert_eq!( - extract_real_url(raw), - Some("https://example.com/a b".to_string()) - ); -} + // The url-less result is dropped; the other three are kept in order. + assert_eq!(results.len(), 3); -#[test] -fn extract_real_url_adds_scheme_to_protocol_relative() { + // Full result: title, url, date, and content normalized to one line. assert_eq!( - extract_real_url("//example.com/x"), - Some("https://example.com/x".to_string()) + results[0].title, + "Genpact and Parallel Web Systems Partner to Drive Tangible Efficiency from AI Systems" ); -} - -#[test] -fn extract_real_url_passes_through_plain_http() { assert_eq!( - extract_real_url("https://example.com/"), - Some("https://example.com/".to_string()) + results[0].url, + "https://www.prnewswire.com/news-releases/genpact-parallel-302736563.html" ); - assert_eq!( - extract_real_url("http://example.com/"), - Some("http://example.com/".to_string()) - ); -} - -#[test] -fn extract_real_url_drops_ads_and_unsafe_and_empty() { - // Ad links. - assert_eq!( - extract_real_url("//duckduckgo.com/y.js?ad_provider=x"), - None - ); - // Non-http(s) schemes. - assert_eq!(extract_real_url("javascript:alert(1)"), None); - assert_eq!(extract_real_url("data:text/html,hi"), None); - // Empty. - assert_eq!(extract_real_url(""), None); -} - -// ---- pure helpers: parse_lite_results ------------------------------------- - -#[test] -fn parse_lite_results_extracts_decodes_dedupes_and_filters() { - let results = parse_lite_results(FIXTURE); - - // Kept, in order: rust-lang (redirect), direct link, no-snippet result. - // Dropped: duplicate target, duckduckgo.com target, "More info" title. - let titles: Vec<&str> = results.iter().map(|r| r.title.as_str()).collect(); - assert_eq!( - titles, - vec![ - "The Rust Programming Language", - "Direct Link No Redirect", - "No Snippet Result", - ] - ); - - // First result: redirect unwrapped + snippet entity-decoded + normalized. 
- assert_eq!(results[0].url, "https://www.rust-lang.org/"); + assert_eq!(results[0].published_date.as_deref(), Some("2026-04-08")); assert_eq!( results[0].description, - "A language empowering everyone to build reliable & efficient software — fast." + "# Genpact and Parallel ## Share this article Integrating Parallel's API helps Genpact automate research workflows." ); - // Direct (non-redirect) link is passed through with its own snippet. - assert_eq!(results[1].url, "https://direct.example.com/page"); - assert_eq!(results[1].description, "direct link snippet"); + // Date is optional. + assert_eq!(results[1].title, "Parallel raises $100M"); + assert_eq!(results[1].published_date, None); - // A result with no following snippet gets an empty description. - assert_eq!(results[2].url, "https://no-snippet.example.com/"); - assert_eq!(results[2].description, ""); + // Title is optional (empty when the source provided none). + assert_eq!(results[2].title, ""); + assert_eq!(results[2].url, "https://untitled.example.com/page"); + assert_eq!(results[2].published_date.as_deref(), Some("2026-05-19")); } #[test] -fn parse_lite_results_handles_empty_and_resultless_html() { - assert!(parse_lite_results("").is_empty()); - assert!(parse_lite_results("no results here").is_empty()); +fn parse_results_handles_empty_and_missing_results() { + assert!(parse_results(r#"{"results": []}"#).unwrap().is_empty()); + // `results` defaults when absent. + assert!(parse_results("{}").unwrap().is_empty()); } -/// Inline markup inside a title/snippet, real whitespace runs, and a broadened -/// named entity: exercises `text_from_html` tag-stripping (both separators), -/// `normalize_whitespace` via the parse path, and the entity table. #[test] -fn parse_lite_results_strips_inline_markup_and_collapses_whitespace() { - let html = "\ - \ - \ -
The Rust Book
Tokio is an\n async runtime for café & more.
"; - let results = parse_lite_results(html); - assert_eq!(results.len(), 1); - // Title: tags stripped (separator ""), single-spaced. - assert_eq!(results[0].title, "The Rust Book"); - assert_eq!(results[0].url, "https://book.example.com/"); - // Snippet: tags -> space, é/& decoded, whitespace runs collapsed. - assert_eq!( - results[0].description, - "Tokio is an async runtime for café & more." - ); +fn parse_results_rejects_malformed_bodies() { + for body in ["not json", "", r#"{"results": "nope"}"#, "[1,2,3]"] { + let err = parse_results(body).unwrap_err(); + assert!( + matches!(err, SearchError::Decode(_)), + "expected Decode for {body:?}, got {err:?}" + ); + } } // ---- pure helpers: format_results ----------------------------------------- @@ -248,11 +177,13 @@ fn format_results_renders_header_and_numbered_entries() { SearchResult { title: "First".to_string(), url: "https://a.example/".to_string(), + published_date: Some("2026-04-08".to_string()), description: "first snippet".to_string(), }, SearchResult { title: "Second".to_string(), url: "https://b.example/".to_string(), + published_date: None, description: String::new(), }, ]; @@ -266,18 +197,37 @@ fn format_results_renders_header_and_numbered_entries() { out.contains("do NOT navigate to a search engine"), "got: {out}" ); - assert!(out.contains("1. First"), "got: {out}"); + // The publication date is appended to the title line when known. + assert!(out.contains("1. First (2026-04-08)"), "got: {out}"); assert!(out.contains(" URL: https://a.example/"), "got: {out}"); assert!(out.contains(" first snippet"), "got: {out}"); - assert!(out.contains("2. Second"), "got: {out}"); + // No date -> bare title line. + assert!(out.contains("2. 
Second\n"), "got: {out}"); assert!(out.contains(" URL: https://b.example/"), "got: {out}"); } +#[test] +fn format_results_falls_back_to_url_for_untitled_results() { + let results = vec![SearchResult { + title: String::new(), + url: "https://untitled.example.com/page".to_string(), + published_date: None, + description: "snippet".to_string(), + }]; + let out = format_results("q", &results); + // The fallback title is the URL, subject to the same 30-char cap. + assert!( + out.contains("1. https://untitled.example.com/…"), + "untitled result should show its URL as the title: {out}" + ); +} + #[test] fn format_results_truncates_long_title_and_description() { let results = vec![SearchResult { title: "ThisIsAVeryLongResultTitleThatExceedsThirtyCharacters".to_string(), url: "https://example.com/keep/this/whole/url".to_string(), + published_date: None, description: "d".repeat(250), }]; let out = format_results("q", &results); @@ -311,43 +261,46 @@ fn format_results_truncates_long_title_and_description() { // ---- pure helpers: classify_response -------------------------------------- #[test] -fn classify_response_flags_challenge_status_and_anomaly_body() { +fn classify_response_names_auth_and_billing_errors() { assert!(matches!( - classify_response(202, "anything"), - Err(SearchError::Challenge) + classify_response(401, "unauthorized"), + Err(SearchError::Unauthorized) )); assert!(matches!( - classify_response(200, "...Anomaly detected..."), - Err(SearchError::Challenge) + classify_response(402, "payment required"), + Err(SearchError::InsufficientBalance) )); } #[test] -fn classify_response_flags_http_errors_with_snippet() { +fn classify_response_flags_other_errors_with_snippet() { + // 400 invalid query, 422 upstream rejected, 502/503 upstream down — all + // carry the status + body snippet. 
+ for status in [400u16, 422, 502, 503] { + match classify_response(status, "boom") { + Err(SearchError::Http { + status: got, + snippet, + }) => { + assert_eq!(got, status); + assert_eq!(snippet, "boom"); + } + other => panic!("expected Http for {status}, got {other:?}"), + } + } + // The snippet is truncated to 200 chars. let body = "x".repeat(500); - match classify_response(503, &body) { - Err(SearchError::Http { status, snippet }) => { - assert_eq!(status, 503); - assert_eq!( - snippet.chars().count(), - 200, - "snippet truncated to 200 chars" - ); + match classify_response(500, &body) { + Err(SearchError::Http { snippet, .. }) => { + assert_eq!(snippet.chars().count(), 200, "snippet truncated"); } other => panic!("expected Http error, got {other:?}"), } } #[test] -fn classify_response_flags_4xx_and_pins_the_400_boundary() { - // 4xx is the case the port must handle (not just 5xx). - match classify_response(404, "not found") { - Err(SearchError::Http { status, snippet }) => { - assert_eq!(status, 404); - assert_eq!(snippet, "not found"); - } - other => panic!("expected Http error, got {other:?}"), - } +fn classify_response_accepts_ok_and_pins_the_400_boundary() { + assert!(classify_response(200, r#"{"results":[]}"#).is_ok()); // The 399-ok / 400-error boundary pins against an off-by-one in `>= 400`. 
assert!(classify_response(399, "ok").is_ok()); assert!(matches!( @@ -356,20 +309,15 @@ fn classify_response_flags_4xx_and_pins_the_400_boundary() { )); } -#[test] -fn classify_response_accepts_ok() { - assert!(classify_response(200, "fine").is_ok()); -} - // ---- run() through the fake backend --------------------------------------- #[tokio::test] -async fn run_formats_results_from_backend_html() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(FIXTURE.to_string()))); +async fn run_formats_results_from_backend_json() { + let tool = SearchTool::with_backend(Arc::new(StubBackend(FIXTURE.to_string()))); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool - .run(&SearchRequest::new("rust"), &attempt, &ctx()) + .run(&SearchRequest::new("parallel"), &attempt, &ctx()) .await .unwrap(); @@ -377,19 +325,28 @@ async fn run_formats_results_from_backend_html() { assert!(out.stderr.is_empty()); assert!( out.stdout - .contains("Search results for \"rust\" (3 results):"), + .contains("Search results for \"parallel\" (3 results):"), "got: {}", out.stdout ); - // This title (29 chars) is within the 30-char cap, so it appears in full. + // Title truncated to 30 chars (incl. ellipsis) with the date appended. assert!( - out.stdout.contains("The Rust Programming Language"), + out.stdout + .contains("1. Genpact and Parallel Web Syst… (2026-04-08)"), "got: {}", out.stdout ); // URLs are kept intact. assert!( - out.stdout.contains("https://www.rust-lang.org/"), + out.stdout + .contains("https://www.prnewswire.com/news-releases/genpact-parallel-302736563.html"), + "got: {}", + out.stdout + ); + // Multi-line markdown content arrives normalized to one line. 
+ assert!( + out.stdout + .contains("# Genpact and Parallel ## Share this article"), "got: {}", out.stdout ); @@ -397,9 +354,7 @@ async fn run_formats_results_from_backend_html() { #[tokio::test] async fn run_reports_no_results() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend( - "nothing".to_string(), - ))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(r#"{"results":[]}"#.to_string()))); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool @@ -413,7 +368,7 @@ async fn run_reports_no_results() { #[tokio::test] async fn run_rejects_empty_query() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); let launch = none_launch(); let attempt = none_attempt(&launch); let err = tool @@ -428,11 +383,11 @@ async fn run_rejects_empty_query() { #[tokio::test] async fn run_surfaces_backend_failure_as_soft_error() { - let tool = SearchTool::with_backend(Arc::new(ChallengeBackend)); + let tool = SearchTool::with_backend(Arc::new(UnauthorizedBackend)); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool - .run(&SearchRequest::new("rust"), &attempt, &ctx()) + .run(&SearchRequest::new("parallel"), &attempt, &ctx()) .await .unwrap(); @@ -441,7 +396,25 @@ async fn run_surfaces_backend_failure_as_soft_error() { assert_eq!(out.exit_code, 1); assert!(out.stdout.is_empty()); assert!( - out.stderr.contains("Search failed:") && out.stderr.contains("challenge"), + out.stderr.contains("Search failed:") && out.stderr.contains("API key"), + "got: {}", + out.stderr + ); +} + +#[tokio::test] +async fn run_surfaces_malformed_body_as_soft_error() { + let tool = SearchTool::with_backend(Arc::new(StubBackend("not json".to_string()))); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let out = tool + .run(&SearchRequest::new("parallel"), &attempt, &ctx()) + .await + .unwrap(); + + 
assert_eq!(out.exit_code, 1); + assert!( + out.stderr.contains("Search failed:") && out.stderr.contains("unexpected response body"), "got: {}", out.stderr ); @@ -451,8 +424,8 @@ async fn run_surfaces_backend_failure_as_soft_error() { #[test] fn approval_accessors() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); - let req = SearchRequest::new("rust"); + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); + let req = SearchRequest::new("parallel"); assert_eq!(tool.approval_keys(&req).len(), 1, "one key per call"); assert_eq!( tool.sandbox_permissions(&req), @@ -462,10 +435,11 @@ fn approval_accessors() { } #[test] -fn search_is_serial_to_avoid_rate_limit_blocks() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); +fn search_is_serial_by_default() { + // A conservative scheduling default for a billed API call. + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); assert_eq!( - tool.parallel_safe(&SearchRequest::new("rust")), + tool.parallel_safe(&SearchRequest::new("parallel")), SEARCH_PARALLEL_SAFE ); assert!(!SEARCH_PARALLEL_SAFE); @@ -474,7 +448,7 @@ fn search_is_serial_to_avoid_rate_limit_blocks() { #[test] fn tool_name_is_search() { assert_eq!(SEARCH_TOOL_NAME, "search"); - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); assert_eq!(tool.name(), "search"); } @@ -492,12 +466,12 @@ fn request_round_trips_wire_shape() { #[tokio::test] async fn orchestrated_search_completes_under_none() { let orch = ToolOrchestrator::new(NoneSandboxProvider, AutoApprover); - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(FIXTURE.to_string()))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(FIXTURE.to_string()))); let result = orch .run( &tool, - &SearchRequest::new("rust"), + &SearchRequest::new("parallel"), &ctx(), &turn_env(), AskForApproval::Never, @@ 
-507,35 +481,41 @@ async fn orchestrated_search_completes_under_none() { assert_eq!(result.sandbox_used, SandboxType::None); assert_eq!(result.output.exit_code, 0); - // Within the 30-char title cap, so it appears in full. assert!( - result - .output - .stdout - .contains("The Rust Programming Language"), + result.output.stdout.contains("Genpact and Parallel"), "got: {}", result.output.stdout ); } -// ---- live smoke (ignored: hits the real DuckDuckGo endpoint) -------------- +// ---- live smoke (ignored: hits the real browser-use search API) ----------- -/// End-to-end check against the REAL DuckDuckGo Lite endpoint via the default -/// [`HttpSearchBackend`]. Ignored by default (network + non-deterministic, and -/// DuckDuckGo may rate-limit/serve a challenge). Run it manually with: +/// End-to-end check against the REAL `search.browser-use.com` service via the +/// default [`HttpSearchBackend`]. Ignored by default (network, billing, and a +/// `BROWSER_USE_API_KEY` requirement). Run it manually with: /// /// ```text /// cargo test -p browser-use-agent --lib -- --ignored --nocapture search_live_smoke /// ``` #[tokio::test] -#[ignore = "hits the live DuckDuckGo Lite endpoint"] +#[ignore = "hits the live browser-use search API (requires BROWSER_USE_API_KEY, \ + or BROWSER_USE_SEARCH_URL pointing at an open dev instance)"] async fn search_live_smoke() { + let has_key = std::env::var("BROWSER_USE_API_KEY").is_ok_and(|key| !key.trim().is_empty()); + let has_url = std::env::var("BROWSER_USE_SEARCH_URL").is_ok_and(|url| !url.trim().is_empty()); + if !has_key && !has_url { + eprintln!( + "skipping live smoke: neither BROWSER_USE_API_KEY nor BROWSER_USE_SEARCH_URL is set" + ); + return; + } + let tool = SearchTool::new(); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool .run( - &SearchRequest::new("rust programming language"), + &SearchRequest::new("Parallel Web Systems latest announcements"), &attempt, &ctx(), ) @@ -546,8 +526,8 @@ async fn 
search_live_smoke() { "exit_code={}\n--- stdout ---\n{}\n--- stderr ---\n{}", out.exit_code, out.stdout, out.stderr ); - // A challenge/CAPTCHA is a legitimate live outcome (exit 1 + message); only - // assert hard on the success shape so the test documents both paths. + // An auth/billing rejection is a legitimate live outcome (exit 1 + message); + // only assert hard on the success shape so the test documents both paths. if out.exit_code == 0 { assert!( out.stdout.contains("Search results for") || out.stdout.contains("No results found"), diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index ced17ed8..7d0d66b8 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1155,14 +1155,14 @@ to the single frame that proves the task succeeded." } } - /// `search`: a LOCALLY-executed DuckDuckGo (Lite) web search. Unlike the - /// hosted [`web_search`](definitions::web_search), the client performs the - /// HTTP request itself and returns the parsed results as text. Ported from - /// the Python `search` action's description. + /// `search`: a web search via the browser-use search API + /// (`search.browser-use.com`). Unlike the hosted + /// [`web_search`](definitions::web_search), the client performs the API + /// call itself and returns the parsed results as text. pub fn search() -> ToolDefinition { ToolDefinition { name: "search".to_string(), - description: "Search the web with a local DuckDuckGo Lite request and return compact \ + description: "Search the web with the browser-use search API and return compact \ text results. This does not use or require a browser connection or browser \ session. 
Use this instead of navigating a browser to Google, DuckDuckGo, Bing, \ or any other search engine; it is far more token-efficient than reading a search \ @@ -2036,8 +2036,8 @@ where tool_search, ); reg.register::<_, WebSearchRequest>("web_search", definitions::web_search(), true, web_search); - // `search`: locally-executed DuckDuckGo search. Serial to avoid - // DuckDuckGo Lite rate-limit blocks from concurrent requests. + // `search`: web search via the browser-use search API. Serial: a + // conservative scheduling default for a billed API call. reg.register::<_, SearchRequest>( "search", definitions::search(), diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index a02e287b..88ef282b 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -488,8 +488,8 @@ impl McpClient for FakeMcpClient { } } -/// A fake search backend: returns a canned DuckDuckGo Lite HTML fragment with a -/// single result echoing the query, so no network is touched (mirrors +/// A fake search backend: returns a canned search-API JSON body with a single +/// result echoing the query, so no network is touched (mirrors /// `search_tests.rs`). struct FakeSearchBackend; @@ -497,10 +497,7 @@ struct FakeSearchBackend; impl SearchBackend for FakeSearchBackend { async fn fetch(&self, query: &str) -> Result { Ok(format!( - "\ - \ - \ -
Result for {query}
snippet for {query}
" + r#"{{"results":[{{"title":"Result for {query}","url":"https://example.com/","content":"snippet for {query}"}}]}}"# )) } } @@ -582,8 +579,8 @@ fn default_registry_registers_all_tools() { fn search_definition_guides_model_away_from_browser_search_engines() { let desc = definitions::search().description; assert!( - desc.contains("local DuckDuckGo Lite request"), - "search description should explain the local search backend: {desc}" + desc.contains("browser-use search API"), + "search description should explain the search backend: {desc}" ); assert!( desc.contains("does not use or require a browser connection or browser session"), From 702421da36e3ff79e757ee3d09a637f0f4d436ae Mon Sep 17 00:00:00 2001 From: reformedot Date: Mon, 8 Jun 2026 18:03:05 -0700 Subject: [PATCH 07/10] Shorten the search tool description to a Codex-style one-liner Replace the multi-sentence search description with a concise one-liner matching the house/codex style (cf. web_search: "Search the web for a free-text query."), keeping only the key differentiator (no browser needed). Update the definition test accordingly + add a length guard. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/browser-use-agent/src/tools/registry.rs | 8 ++------ .../src/tools/registry_tests.rs | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 327c5f0b..39808dd8 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1168,12 +1168,8 @@ to the single frame that proves the task succeeded." pub fn search() -> ToolDefinition { ToolDefinition { name: "search".to_string(), - description: "Search the web with the browser-use search API and return compact \ - text results. This does not use or require a browser connection or browser \ - session. 
Use this instead of navigating a browser to Google, DuckDuckGo, Bing, \ - or any other search engine; it is far more token-efficient than reading a search \ - results page in the browser. Only use the browser after search when you need to \ - inspect a specific result page." + description: "Search the web for a free-text query and return results as text. \ + No browser needed; prefer this over opening a browser to a search engine." .to_string(), input_schema: json!({ "type": "object", diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index c2982efe..6fe52519 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -576,23 +576,25 @@ fn default_registry_registers_all_tools() { } #[test] -fn search_definition_guides_model_away_from_browser_search_engines() { +fn search_definition_is_concise_and_guides_away_from_browser() { let desc = definitions::search().description; assert!( - desc.contains("browser-use search API"), - "search description should explain the search backend: {desc}" + desc.contains("Search the web"), + "describes a web search: {desc}" ); assert!( - desc.contains("does not use or require a browser connection or browser session"), - "search description should make clear no browser connection is needed: {desc}" + desc.contains("No browser needed"), + "search description should make clear no browser is needed: {desc}" ); assert!( - desc.contains("instead of navigating a browser"), + desc.contains("prefer this over opening a browser"), "search description should prefer this tool over browser search-engine navigation: {desc}" ); + // Keep it concise (Codex-style one-liner), unlike a multi-sentence blurb. 
assert!( - desc.contains("token-efficient"), - "search description should call out the token-efficiency reason: {desc}" + desc.len() < 160, + "search description should stay concise, got {} chars: {desc}", + desc.len() ); } From 72d7e9583ce22b618fb94797929f68b04afda992 Mon Sep 17 00:00:00 2001 From: reformedot Date: Tue, 9 Jun 2026 18:33:51 -0700 Subject: [PATCH 08/10] Wire the stored cloud API key to the search tool + post-launch review fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit search.browser-use.com is live; an integration review against the running service and the Go source surfaced these fixes: - TUI (`prepare_tui_agent_run`): load + export the stored cloud API key (auth.browser_use_cloud.api_key) for ALL browser modes, not only Browser Use Cloud — the `search` tool reads BROWSER_USE_API_KEY from the env, so a signed-in user on Local Chrome previously got MissingApiKey. An explicitly exported env key wins; the store only fills it when unset. - CLI (`run_session_via_engine_with_runtime_and_cancel`): same export on the headless run path (env-first, store fallback). - parse_results: restore the http(s)-only URL allowlist the DuckDuckGo-era code enforced — the output tells the model to navigate to result URLs, so javascript:/data:/relative URLs from upstream are dropped (+ test). - Contract docs: the service returns 429 (rate limited) and never 422 (upstream 400/422 are sanitized into client 400); fix the three doc sites and the classify_response test statuses. - registry_tests: pin `search` serial via the hard-coded loop only, removing the self-contradicting constant-based assert. Live-verified against production: POST to search.browser-use.com with the X-Browser-Use-API-Key header; 401 correctly surfaces as the model-facing soft error. (Happy path previously verified against a dev instance.) 
Co-Authored-By: Claude Fable 5 --- .../src/tools/handlers/search.rs | 26 ++++++++++++------- .../src/tools/handlers/search_tests.rs | 24 +++++++++++++++-- .../src/tools/registry_tests.rs | 8 +++--- crates/browser-use-cli/src/main.rs | 12 +++++++++ crates/browser-use-tui/src/runtime.rs | 20 +++++++++----- 5 files changed, 66 insertions(+), 24 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/search.rs b/crates/browser-use-agent/src/tools/handlers/search.rs index 0e226a7b..3fd988d6 100644 --- a/crates/browser-use-agent/src/tools/handlers/search.rs +++ b/crates/browser-use-agent/src/tools/handlers/search.rs @@ -29,9 +29,10 @@ //! * `200` → `{"results": [{"title"?, "url", "published_date"?, "content"}]}`; //! `title` / `published_date` are omitted when the source lacks them, and //! `content` is multi-line markdown (whitespace-normalized here). -//! * Errors: `400` invalid query, `401` missing/invalid API key, `402` -//! insufficient balance, `422` upstream rejected the request, `502` upstream -//! failed, `503` auth/billing backend unavailable. +//! * Errors: `400` invalid query (the service also sanitizes an +//! upstream-rejected query into this), `401` missing/invalid API key, `402` +//! insufficient balance, `429` rate limit exceeded (retry later), `502` +//! upstream failed, `503` auth/billing backend unavailable. //! //! # Network seam (testability) //! @@ -138,7 +139,7 @@ pub enum SearchError { /// The project balance is exhausted (HTTP 402). #[error("insufficient browser-use balance (HTTP 402)")] InsufficientBalance, - /// Any other client/server error status (400, 422, 502, 503, …). + /// Any other client/server error status (400, 429, 502, 503, …). #[error("HTTP {status}: {snippet}")] Http { /// The HTTP status code. 
@@ -242,8 +243,8 @@ impl SearchBackend for HttpSearchBackend { /// Classify an HTTP response per the service's documented statuses: `401` and /// `402` get named, actionable errors; any other `>= 400` (400 invalid query, -/// 422 upstream rejected, 502 upstream failed, 503 auth backend down) carries -/// the status plus the first 200 chars of the body; everything else is success. +/// 429 rate limited, 502 upstream failed, 503 auth backend down) carries the +/// status plus the first 200 chars of the body; everything else is success. pub fn classify_response(status: u16, body: &str) -> Result<(), SearchError> { match status { 401 => Err(SearchError::Unauthorized), @@ -424,9 +425,11 @@ struct SearchResultWire { /// Parse the search service's JSON response body into results. /// /// The wire `content` arrives as multi-line markdown; it is whitespace- -/// normalized into the single-line `description`. Results without a `url` are -/// dropped (the model cannot follow them). A body that is not the documented -/// JSON shape is a [`SearchError::Decode`]. +/// normalized into the single-line `description`. Results without an +/// `http(s)://` URL are dropped: a url-less result cannot be followed, and the +/// formatted output tells the model to navigate to result URLs, so unsafe +/// schemes (`javascript:`, `data:`, …) must never surface. A body that is not +/// the documented JSON shape is a [`SearchError::Decode`]. 
pub fn parse_results(body: &str) -> Result, SearchError> { let wire: SearchResponseWire = serde_json::from_str(body).map_err(|err| SearchError::Decode(err.to_string()))?; @@ -434,7 +437,10 @@ pub fn parse_results(body: &str) -> Result, SearchError> { Ok(wire .results .into_iter() - .filter(|result| !result.url.trim().is_empty()) + .filter(|result| { + let url = result.url.trim(); + url.starts_with("https://") || url.starts_with("http://") + }) .map(|result| SearchResult { title: normalize_whitespace(&result.title), url: result.url.trim().to_string(), diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs index ebfcd54e..0460e7f9 100644 --- a/crates/browser-use-agent/src/tools/handlers/search_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -158,6 +158,26 @@ fn parse_results_handles_empty_and_missing_results() { assert!(parse_results("{}").unwrap().is_empty()); } +/// The formatted output instructs the model to navigate to result URLs, so +/// only `http(s)://` destinations may surface; unsafe schemes are dropped. 
+#[test] +fn parse_results_drops_non_http_urls() { + let body = r##"{"results": [ + {"title": "ok https", "url": "https://safe.example.com/", "content": "keep"}, + {"title": "ok http", "url": "http://plain.example.com/", "content": "keep"}, + {"title": "xss", "url": "javascript:alert(1)", "content": "drop"}, + {"title": "data", "url": "data:text/html,hi", "content": "drop"}, + {"title": "relative", "url": "/relative/path", "content": "drop"}, + {"title": "empty", "url": "", "content": "drop"} + ]}"##; + let results = parse_results(body).unwrap(); + let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect(); + assert_eq!( + urls, + vec!["https://safe.example.com/", "http://plain.example.com/"] + ); +} + #[test] fn parse_results_rejects_malformed_bodies() { for body in ["not json", "", r#"{"results": "nope"}"#, "[1,2,3]"] { @@ -274,9 +294,9 @@ fn classify_response_names_auth_and_billing_errors() { #[test] fn classify_response_flags_other_errors_with_snippet() { - // 400 invalid query, 422 upstream rejected, 502/503 upstream down — all + // 400 invalid query, 429 rate limited, 502/503 upstream down — all // carry the status + body snippet. 
- for status in [400u16, 422, 502, 503] { + for status in [400u16, 429, 502, 503] { match classify_response(status, "boom") { Err(SearchError::Http { status: got, diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 6fe52519..5cd8a7c5 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -28,9 +28,7 @@ use crate::tools::handlers::mcp::{ McpCallResult, McpClient, McpTool, McpToolCallRequest, McpWireArgs, }; use crate::tools::handlers::python::{PythonBackend, PythonRequest, PythonTool}; -use crate::tools::handlers::search::{ - SearchBackend, SearchError, SearchTool, SEARCH_PARALLEL_SAFE, -}; +use crate::tools::handlers::search::{SearchBackend, SearchError, SearchTool}; use crate::tools::handlers::shell::{ShellRequest, ShellTool}; use crate::tools::handlers::tool_search::{ToolSearchEntry, ToolSearchRequest, ToolSearchTool}; use crate::tools::handlers::update_plan::{UpdatePlanRequest, UpdatePlanTool}; @@ -604,8 +602,8 @@ fn parallel_safe_flags_match_registration() { // Pure / read-only tools are parallel-safe. assert_eq!(reg.parallel_safe("tool_search"), Some(true)); assert_eq!(reg.parallel_safe("web_search"), Some(true)); - assert_eq!(reg.parallel_safe("search"), Some(SEARCH_PARALLEL_SAFE)); - // Everything else is serial. + // Everything else is serial — including `search`, pinned serial below as + // the conservative scheduling default for a billed API call. 
for name in [ "shell", "apply_patch", diff --git a/crates/browser-use-cli/src/main.rs b/crates/browser-use-cli/src/main.rs index 8b733bca..ceef8a6f 100644 --- a/crates/browser-use-cli/src/main.rs +++ b/crates/browser-use-cli/src/main.rs @@ -1571,6 +1571,18 @@ fn run_session_via_engine_with_runtime_and_cancel( cancellation_token: tokio_util::sync::CancellationToken, browser_id: Option, ) -> Result { + // Make the stored cloud API key visible to Rust-side Browser Use API + // calls (cloud browser, `search` tool) that read it from the environment, + // mirroring the TUI's `prepare_tui_agent_run`. The env wins when already + // set; the store fills it in otherwise. + if std::env::var(BROWSER_USE_CLOUD_API_KEY_ENV).map_or(true, |v| v.trim().is_empty()) { + if let Some(api_key) = store + .get_setting(BROWSER_USE_CLOUD_API_KEY_SETTING)? + .filter(|value| !value.trim().is_empty()) + { + std::env::set_var(BROWSER_USE_CLOUD_API_KEY_ENV, api_key); + } + } let _local_runtime_server = CliLocalRuntimeServer::ensure(store, &runtime_handle)?; let executor = cli_runtime_agent_executor(store, runtime_handle)?; attach_cli_child_agent_runner(store, executor.clone(), &mut config); diff --git a/crates/browser-use-tui/src/runtime.rs b/crates/browser-use-tui/src/runtime.rs index 48ecc934..fb774be3 100644 --- a/crates/browser-use-tui/src/runtime.rs +++ b/crates/browser-use-tui/src/runtime.rs @@ -351,11 +351,11 @@ fn prepare_tui_agent_run( notifier: Option, ) -> Result<(RuntimeAgentExecutor, ProviderRunConfig)> { let store = Store::open_with_optional_notifier(&state_dir, notifier.clone())?; - let browser_use_cloud_api_key = if browser == BROWSER_USE_CLOUD { - browser_use_cloud_api_key(&store)? - } else { - None - }; + // Load the stored cloud API key regardless of browser mode: besides the + // cloud browser, the Rust-side Browser Use API calls (e.g. the `search` + // tool against search.browser-use.com) read it from the environment. Only + // the Browser Use Cloud browser *requires* it. 
+ let browser_use_cloud_api_key = browser_use_cloud_api_key(&store)?; if browser == BROWSER_USE_CLOUD && browser_use_cloud_api_key.is_none() { let error = "Browser Use Cloud selected, but BROWSER_USE_API_KEY is not set"; let _ = store.append_event( @@ -370,8 +370,14 @@ fn prepare_tui_agent_run( .filter(|value| !value.trim().is_empty()) { // Browser runtime is Rust-owned now, so the cloud API key must also be - // visible to Rust-side Browser Use API calls, not only the legacy Python worker. - std::env::set_var(BROWSER_USE_CLOUD_API_KEY_ENV, api_key); + // visible to Rust-side Browser Use API calls (cloud browser, `search` + // tool), not only the legacy Python worker. An explicitly exported + // env key wins; the store only fills it in when the env is unset — + // matching the CLI (`run_session_via_engine_with_runtime_and_cancel`) + // and avoiding a per-run env write when one is not needed. + if std::env::var(BROWSER_USE_CLOUD_API_KEY_ENV).map_or(true, |v| v.trim().is_empty()) { + std::env::set_var(BROWSER_USE_CLOUD_API_KEY_ENV, api_key); + } } let mut config = ProviderRunConfig::new(backend.into(), model.clone()) .with_options(tui_agent_options( From 0ca0bf2233e7ad30225c218fab8380aa99e62133 Mon Sep 17 00:00:00 2001 From: reformedot Date: Tue, 9 Jun 2026 19:17:09 -0700 Subject: [PATCH 09/10] Stop registering the hosted web_search tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the browser-use `search` tool live, the hosted `web_search` competed with it: on OpenAI backends the Responses builder encodes it as the provider-side `web_search_preview`, and the model prefers its native search — observed in terminal testing, where searches bypassed search.browser-use.com entirely (and its billing/auth). Remove the registration from both the production dispatcher and `default_registry` so no provider-side search is emitted and all searches go through `search`. 
The handler module stays as the codex-parity model of the hosted capability (and as a pure registry-test fixture); its unused `definitions::web_search()` is deleted. The dispatcher membership test now pins `web_search` ABSENT and `search` present. Co-Authored-By: Claude Fable 5 --- .../src/entrypoint/provider.rs | 40 +++++++++--------- .../src/tools/handlers/web_search.rs | 12 ++++++ .../browser-use-agent/src/tools/registry.rs | 42 ++++++------------- .../src/tools/registry_tests.rs | 29 +++++++------ 4 files changed, 61 insertions(+), 62 deletions(-) diff --git a/crates/browser-use-agent/src/entrypoint/provider.rs b/crates/browser-use-agent/src/entrypoint/provider.rs index 7dce60b0..7a94fdc9 100644 --- a/crates/browser-use-agent/src/entrypoint/provider.rs +++ b/crates/browser-use-agent/src/entrypoint/provider.rs @@ -1138,10 +1138,10 @@ fn resolve_provider_with_python( /// ## Which tools are wired here /// The registry registers the backend-free handlers — `shell`, `apply_patch`, /// `view_image`, `update_plan`, `done`, `tool_search` (catalog populated from the registered tools' defs), -/// `web_search` (ENABLED; the Responses builder encodes it as the hosted -/// `web_search_preview` tool), `search` (a client-executed call to the -/// browser-use search API, distinct from the hosted `web_search`) — plus the -/// two product-surface tools that drive real subsystems: +/// `search` (a client-executed call to the browser-use search API; the hosted +/// `web_search` is intentionally NOT registered, so no provider-side +/// `web_search_preview` is emitted and all searches go through `search`) — +/// plus the two product-surface tools that drive real subsystems: /// * `browser` ([`BrowserTool::new`]): standalone — the production /// [`RealBackend`](crate::tools::handlers::browser::RealBackend) wraps the /// `browser-use-browser` crate and manages CDP sessions internally (keyed by @@ -1245,7 +1245,6 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( use 
crate::tools::handlers::tool_search::{ToolSearchEntry, ToolSearchRequest, ToolSearchTool}; use crate::tools::handlers::update_plan::{UpdatePlanRequest, UpdatePlanTool}; use crate::tools::handlers::view_image::{ViewImageRequest, ViewImageTool}; - use crate::tools::handlers::web_search::{WebSearchConfig, WebSearchRequest, WebSearchTool}; use crate::tools::registry::{definitions, ToolRegistry}; // The backend-free handlers, each with its parity-grounded definition + static @@ -1313,19 +1312,15 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( false, UpdatePlanTool::new(), ); - // `web_search` is ENABLED (hosted/provider-side). The OpenAI Responses - // request builder encodes it as the hosted `{"type":"web_search_preview"}` - // tool (see `browser-use-llm` `openai_responses.rs::lower_tool`). - reg.register::<_, WebSearchRequest>( - "web_search", - definitions::web_search(), - true, - WebSearchTool::new(WebSearchConfig::enabled()), - ); + // The hosted `web_search` is intentionally NOT registered: when present, + // the OpenAI Responses builder encodes it as the provider-side + // `{"type":"web_search_preview"}` tool (`browser-use-llm` + // `openai_responses.rs::lower_tool`) and the model prefers its native + // search over the browser-use one. All searches go through `search`. + // // `search`: web search via the browser-use search API — the client makes - // the API call (auth: `BROWSER_USE_API_KEY`) and formats the results itself - // (distinct from the hosted `web_search` above). Serial: a conservative - // scheduling default for a billed API call. + // the API call (auth: `BROWSER_USE_API_KEY`) and formats the results + // itself. Serial: a conservative scheduling default for a billed API call. 
reg.register::<_, SearchRequest>( "search", definitions::search(), @@ -3286,9 +3281,14 @@ mod tests { assert!(names.contains(&"browser")); assert!(names.contains(&"done")); assert!(names.contains(&"update_plan")); - // Both web searches are wired into the production dispatcher: the hosted - // `web_search` and the browser-use search API `search`. - assert!(names.contains(&"web_search")); + // Search goes through the browser-use search API `search` tool only; + // the hosted `web_search` must NOT be exposed (when registered, the + // OpenAI Responses builder emits the provider-side `web_search_preview` + // and the model prefers it over `search`). + assert!( + !names.contains(&"web_search"), + "hosted web_search must not be registered in the production dispatcher" + ); assert!( names.contains(&"search"), "the `search` tool must be reachable by the live model" diff --git a/crates/browser-use-agent/src/tools/handlers/web_search.rs b/crates/browser-use-agent/src/tools/handlers/web_search.rs index fad86879..fe1fe2a3 100644 --- a/crates/browser-use-agent/src/tools/handlers/web_search.rs +++ b/crates/browser-use-agent/src/tools/handlers/web_search.rs @@ -1,5 +1,17 @@ //! `web_search` tool: the HOSTED, provider-executed web-search capability. //! +//! # Status: NOT REGISTERED (superseded by `search`) +//! +//! This handler is no longer wired into [`default_registry`] or the production +//! dispatcher: when registered, the OpenAI Responses builder encodes it as the +//! provider-side `web_search_preview` tool, which competes with — and the +//! model prefers over — the browser-use [`search`](super::search) tool. All +//! searches now go through `search`. The handler is kept as the codex-parity +//! model of the hosted capability (and as a pure registry-test fixture) should +//! a provider-side search ever need re-enabling. +//! +//! [`default_registry`]: crate::tools::registry::default_registry +//! //! Unlike the other handlers in this module (`shell`, `apply_patch`, //! 
`view_image`, `update_plan`, `tool_search`), `web_search` //! is **not locally dispatched**. It is a *hosted tool*: the model provider runs diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 39808dd8..417602a7 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1142,29 +1142,12 @@ to the single frame that proves the task succeeded." } } - /// `web_search`: a hosted/passthrough web search. Parity: codex - /// `WebSearchArgs { query }` / legacy web_search args. - pub fn web_search() -> ToolDefinition { - ToolDefinition { - name: "web_search".to_string(), - description: "Search the web for a free-text query.".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "query": { "type": "string", "description": "The free-text search query." } - }, - "required": ["query"], - "additionalProperties": false - }), - output_schema: None, - namespace: None, - namespace_description: None, - } - } - /// `search`: a web search via the browser-use search API - /// (`search.browser-use.com`). Unlike the hosted [`web_search`], the client - /// performs the API call itself and returns the parsed results as text. + /// (`search.browser-use.com`). The client performs the API call itself and + /// returns the parsed results as text. This replaced the hosted + /// [`web_search`](crate::tools::handlers::web_search) tool, which is no + /// longer registered (the provider-side `web_search_preview` competed with + /// this tool and the model preferred its native search). pub fn search() -> ToolDefinition { ToolDefinition { name: "search".to_string(), @@ -1957,12 +1940,16 @@ Agent-role guidance below only helps choose which agent to use after spawning is /// `WireArgs` types. The browser/python/mcp handlers need an injected backend /// (they would otherwise reach the OS), so those are supplied by the caller. 
/// -/// `parallel_safe` per tool: `exec_command` / `tool_search` / `web_search` = -/// `true`; `shell` / `apply_patch` / `view_image` / `browser` / `python` / -/// `search` / `update_plan` / `done` = `false` (serial). `mcp` is registered -/// `false` here +/// `parallel_safe` per tool: `exec_command` / `tool_search` = `true`; +/// `shell` / `apply_patch` / `view_image` / `browser` / `python` / `search` / +/// `update_plan` / `done` = `false` (serial). `mcp` is registered `false` here /// (a serial default); its per-request read-only hint still drives the handler's /// own [`ToolRuntime::parallel_safe`](crate::tools::ToolRuntime::parallel_safe). +/// +/// The hosted [`web_search`](crate::tools::handlers::web_search) handler is +/// intentionally absent: when registered, the OpenAI Responses builder emits +/// the provider-side `web_search_preview` tool, which competes with (and the +/// model prefers over) the browser-use `search` tool. #[allow(clippy::too_many_arguments)] pub fn default_registry( shell: crate::tools::handlers::shell::ShellTool, @@ -1973,7 +1960,6 @@ pub fn default_registry( mcp: crate::tools::handlers::mcp::McpTool, update_plan: crate::tools::handlers::update_plan::UpdatePlanTool, tool_search: crate::tools::handlers::tool_search::ToolSearchTool, - web_search: crate::tools::handlers::web_search::WebSearchTool, search: crate::tools::handlers::search::SearchTool, done: crate::tools::handlers::done::DoneTool, ) -> ToolRegistry @@ -1993,7 +1979,6 @@ where use crate::tools::handlers::tool_search::ToolSearchRequest; use crate::tools::handlers::update_plan::UpdatePlanRequest; use crate::tools::handlers::view_image::ViewImageRequest; - use crate::tools::handlers::web_search::WebSearchRequest; let mut reg = ToolRegistry::new(); @@ -2037,7 +2022,6 @@ where true, tool_search, ); - reg.register::<_, WebSearchRequest>("web_search", definitions::web_search(), true, web_search); // `search`: web search via the browser-use search API. 
Serial: a // conservative scheduling default for a billed API call. reg.register::<_, SearchRequest>( diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 5cd8a7c5..57718450 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -516,7 +516,6 @@ fn full_registry() -> ToolRegistry { "manage k8s clusters", ["namespace"], )]), - WebSearchTool::new(WebSearchConfig::enabled()), SearchTool::with_backend(Arc::new(FakeSearchBackend)), DoneTool::new(), ) @@ -535,15 +534,17 @@ fn ctx_at(name: &str, cwd: PathBuf) -> ToolCtx { #[test] fn default_registry_registers_all_tools() { let reg = full_registry(); - assert_eq!(reg.len(), 13, "all tools must register"); + assert_eq!(reg.len(), 12, "all tools must register"); let defs = reg.model_visible_definitions(); assert_eq!( defs.len(), - 13, + 12, "model_visible_definitions must list all tools" ); let mut names: Vec<&str> = defs.iter().map(|d| d.name.as_str()).collect(); names.sort_unstable(); + // The hosted `web_search` is intentionally absent: search goes through + // the browser-use `search` tool only. assert_eq!( names, vec![ @@ -558,7 +559,6 @@ fn default_registry_registers_all_tools() { "tool_search", "update_plan", "view_image", - "web_search", "write_stdin", ] ); @@ -601,7 +601,6 @@ fn parallel_safe_flags_match_registration() { let reg = full_registry(); // Pure / read-only tools are parallel-safe. assert_eq!(reg.parallel_safe("tool_search"), Some(true)); - assert_eq!(reg.parallel_safe("web_search"), Some(true)); // Everything else is serial — including `search`, pinned serial below as // the conservative scheduling default for a billed API call. 
for name in [ @@ -805,7 +804,7 @@ async fn update_plan_dispatches() { } #[tokio::test] -async fn tool_search_and_web_search_dispatch() { +async fn tool_search_dispatches_and_web_search_is_unknown() { let reg = full_registry(); let orch = ToolOrchestrator::stub(); let ts = reg @@ -825,7 +824,9 @@ async fn tool_search_and_web_search_dispatch() { ts.stdout ); - let ws = reg + // The hosted `web_search` is not in the default set: a dispatch to it is + // an unknown-tool error (search goes through the `search` tool only). + let err = reg .dispatch( "web_search", &serde_json::json!({ "query": "rust async" }), @@ -835,12 +836,14 @@ async fn tool_search_and_web_search_dispatch() { &orch, ) .await - .expect("web_search should dispatch"); - assert!( - ws.stdout.contains("rust async"), - "web_search: {:?}", - ws.stdout - ); + .expect_err("web_search must not be registered in the default set"); + match err { + ToolError::Other(e) => assert!( + e.to_string().contains("unknown tool `web_search`"), + "unexpected error: {e}" + ), + other => panic!("expected Other(unknown tool), got {other:?}"), + } } #[tokio::test] From c4080022f25c5d3071528bb5e48352a8eeecae54 Mon Sep 17 00:00:00 2001 From: reformedot Date: Tue, 9 Jun 2026 19:32:34 -0700 Subject: [PATCH 10/10] Cover ProviderBackend::BrowserUse in CLI analytics matches origin/main's #80 added the BrowserUse provider backend but missed the two exhaustive matches in the CLI's message analytics, so the CLI crate does not compile on main. Cover the variant per the backend's own conventions: an api_key-authenticated provider with id "browser-use" (entrypoint/provider.rs maps it to OpenAiCompatibleCustom with provider_id "browser-use" and BROWSER_USE_API_KEY auth). 
Co-Authored-By: Claude Fable 5 --- crates/browser-use-cli/src/main.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/browser-use-cli/src/main.rs b/crates/browser-use-cli/src/main.rs index 172b5dbe..56a86f16 100644 --- a/crates/browser-use-cli/src/main.rs +++ b/crates/browser-use-cli/src/main.rs @@ -2823,7 +2823,8 @@ fn analytics_provider_kind_for_backend(backend: ProviderBackend) -> &'static str ProviderBackend::Openai | ProviderBackend::Anthropic | ProviderBackend::Openrouter - | ProviderBackend::Deepseek => "api_key", + | ProviderBackend::Deepseek + | ProviderBackend::BrowserUse => "api_key", ProviderBackend::Fake | ProviderBackend::None => "other", } } @@ -2835,6 +2836,7 @@ fn provider_id_for_backend(backend: ProviderBackend) -> &'static str { ProviderBackend::Anthropic => "anthropic", ProviderBackend::Openrouter => "openrouter", ProviderBackend::Deepseek => "deepseek", + ProviderBackend::BrowserUse => "browser-use", ProviderBackend::Fake => "fake", ProviderBackend::None => "none", }