diff --git a/Cargo.lock b/Cargo.lock index 94fecebd..682c7126 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -267,6 +267,7 @@ dependencies = [ "browser-use-secrets", "browser-use-store", "futures-util", + "libc", "portable-pty", "rand 0.9.4", "regex", diff --git a/Cargo.toml b/Cargo.toml index c99553b3..bb0c1e4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ ignore = "0.4" rand = "0.9" iana-time-zone = "0.1" image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "webp"] } +libc = "0.2" ratatui = { version = "0.30", default-features = false, features = ["crossterm_0_29"] } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] } regex = "1" diff --git a/crates/browser-use-agent/Cargo.toml b/crates/browser-use-agent/Cargo.toml index 59705092..8882279e 100644 --- a/crates/browser-use-agent/Cargo.toml +++ b/crates/browser-use-agent/Cargo.toml @@ -26,6 +26,7 @@ rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["stream"] } rustls.workspace = true +libc.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true diff --git a/crates/browser-use-agent/src/compact/mod.rs b/crates/browser-use-agent/src/compact/mod.rs index 164ed355..c6a43505 100644 --- a/crates/browser-use-agent/src/compact/mod.rs +++ b/crates/browser-use-agent/src/compact/mod.rs @@ -302,7 +302,16 @@ pub fn compacted_history_from_summary( summary_suffix: CompactionSummary, token_limit: usize, ) -> CompactedHistory { - let summary_text = format!("{SUMMARY_PREFIX}\n{}", summary_suffix.text); + // Apply the empty-summary fallback to the SUFFIX before prepending the + // prefix — otherwise the prefix makes the string non-empty and the fallback + // in build_compacted_history is dead code, shipping PREFIX + "" (total + // post-compaction amnesia; real_v8 task 24). + let suffix_text = if summary_suffix.text.trim().is_empty() { + "(no summary available)".to_string() + } else { + summary_suffix.text.clone() + }; + let summary_text = format!("{SUMMARY_PREFIX}\n{suffix_text}"); let user_messages: Vec = history.iter().filter_map(real_user_message_text).collect(); let items = build_compacted_history(&user_messages, &summary_text, token_limit); diff --git a/crates/browser-use-agent/src/compact/tests.rs b/crates/browser-use-agent/src/compact/tests.rs index 8905ec56..60135883 100644 --- a/crates/browser-use-agent/src/compact/tests.rs +++ b/crates/browser-use-agent/src/compact/tests.rs @@ -370,7 +370,7 @@ async fn context_window_exceeded_with_only_prompt_left_propagates() { } #[tokio::test] -async fn empty_model_summary_yields_prefix_only_summary() { +async fn empty_model_summary_yields_no_summary_available_fallback() { // The model returns an empty summary => summary_text = PREFIX + "\n" + "". // (The "(no summary available)" placeholder only applies when the WHOLE // summary_text is empty, which it never is here because the prefix is present; @@ -390,7 +390,10 @@ async fn empty_model_summary_yields_prefix_only_summary() { let last = item_text(compacted.items.last().unwrap()); assert_eq!(last, compacted.summary_text); - assert_eq!(compacted.summary_text, format!("{SUMMARY_PREFIX}\n")); + assert_eq!( + compacted.summary_text, + format!("{SUMMARY_PREFIX}\n(no summary available)") + ); } // ---- (5) end-to-end through the real TurnLoop ----------------------------- diff --git a/crates/browser-use-agent/src/entrypoint/mod.rs b/crates/browser-use-agent/src/entrypoint/mod.rs index 4c743a69..5c1765a1 100644 --- a/crates/browser-use-agent/src/entrypoint/mod.rs +++ b/crates/browser-use-agent/src/entrypoint/mod.rs @@ -49,6 +49,7 @@ pub mod provider; +use std::any::Any; use std::future::Future; use std::pin::Pin; use std::sync::Arc; @@ -83,7 +84,7 @@ use crate::context::{ ContextManager, Item, }; use crate::decision::{AutoCompactTokenLimitScope, SamplingOutcome, TokenStatus}; -use crate::events::{names, session_done_payload, EventSink, PendingEvent, TurnCtx}; +use crate::events::{names, session_done_payload, EventSink, PendingEvent, ResultFilePtr, TurnCtx}; use crate::live_executor::ensure_agent_attached as ensure_runtime_agent_attached; use crate::session::reconstruct::WORKSPACE_CONTEXT_MULTI_AGENT_USAGE_HINT_KIND; use crate::session::SessionId; @@ -1393,6 +1394,101 @@ fn runtime_or_store_events( .unwrap_or_default() } +/// Fallback final result discovered from the session cwd when the model ends +/// without a `done()` call (ported from exp/real-v8-restore-88 2bd479d). +struct FallbackResultFile { + text: String, + file: ResultFilePtr, +} + +fn fallback_result_file_for_session( + store: &SharedStore, + session_id: &str, +) -> Option { + let session = store + .lock() + .expect("store mutex poisoned") + .load_session(session_id) + .ok() + .flatten()?; + let cwd = std::path::PathBuf::from(session.cwd); + let files = discover_result_files(&cwd); + let first = files.first()?; + let text = if files.len() == 1 { + format!("Result file: {}", first.path.display()) + } else { + let rendered = files + .iter() + .map(|file| format!("- {}", file.path.display())) + .collect::>() + .join("\n"); + format!("Result files:\n{rendered}") + }; + Some(FallbackResultFile { + text, + file: ResultFilePtr { + url: None, + path: Some(first.path.display().to_string()), + bytes: Some(first.bytes), + }, + }) +} + +struct DiscoveredResultFile { + path: std::path::PathBuf, + bytes: u64, +} + +fn discover_result_files(cwd: &std::path::Path) -> Vec { + let Ok(entries) = std::fs::read_dir(cwd) else { + return Vec::new(); + }; + let mut files = entries + .filter_map(Result::ok) + .filter_map(|entry| { + let path = entry.path(); + let name = path.file_name()?.to_str()?; + if !name.starts_with("result.") { + return None; + } + let metadata = entry.metadata().ok()?; + if !metadata.is_file() || metadata.len() == 0 { + return None; + } + Some(DiscoveredResultFile { + path, + bytes: metadata.len(), + }) + }) + .collect::>(); + files.sort_by(|left, right| { + let left_name = left + .path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or(""); + let right_name = right + .path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or(""); + result_file_priority(left_name) + .cmp(&result_file_priority(right_name)) + .then_with(|| left_name.cmp(right_name)) + }); + files +} + +fn result_file_priority(name: &str) -> usize { + match name { + "result.json" => 0, + "result.csv" => 1, + "result.md" => 2, + "result.txt" => 3, + _ => 10, + } +} + fn ensure_fallback_capture_recording(store: &SharedStore, session_id: &str) { if !fallback_capture_recording_enabled() { return; @@ -2401,40 +2497,15 @@ fn media_content_part_for_provider( } } -/// A [`TurnObserver`] that maps loop lifecycle into the durable UI event log. +/// A [`TurnObserver`] for lifecycle hooks that must not decide terminal status. /// -/// On turn completion it emits the final agent message as a `session.done` -/// event through the durable UI sink, so the run's result is visible to the TUI -/// and protocol reducers. The streaming text deltas are emitted by the sampling -/// driver through the same durable sink. -struct StoreObserver { - sink: Arc, - session_id: String, -} - -impl StoreObserver { - fn new(sink: Arc, session_id: String) -> Self { - Self { sink, session_id } - } -} +/// Runtime-owned runs accept `session.done` only after the turn loop has returned +/// and runtime resources have been quiesced. Streaming model/tool events are +/// still persisted by the sampling driver through [`RuntimeStoreSink`]. +struct StoreObserver; impl TurnObserver for StoreObserver { - fn on_lifecycle(&self, ev: TurnLifecycleEvent) { - // Phase-E seam: started/aborted lifecycle markers are not surfaced as - // store events yet (the legacy stack had richer turn-lifecycle telemetry). - // We persist the terminal session result, which is what readers need today. - if let TurnLifecycleEvent::TurnComplete { - last_agent_message: Some(text), - .. - } = ev - { - self.sink.emit(PendingEvent::new( - self.session_id.clone(), - names::SESSION_DONE, - session_done_payload(Some(&text), None), - )); - } - } + fn on_lifecycle(&self, _ev: TurnLifecycleEvent) {} } /// A network-free scripted driver for the `Fake` backend. @@ -2588,15 +2659,7 @@ impl RuntimeTurnLoopDriver { } *state.pre_turn_replay_from_seq.lock().unwrap() = None; - // The observer persists the terminal agent message through the runtime so - // `session.done` is journaled and projected by the same live authority as - // model/tool events. - let sink: Arc = Arc::new(RuntimeStoreSink { - runtime: runtime_handle, - store: Arc::clone(&store), - model_context_window: None, - }); - let observer = StoreObserver::new(sink, session_id.as_str().to_string()); + let observer = StoreObserver; let turn_loop = TurnLoop::new(state, driver, observer); let result = match max_turns { @@ -2611,10 +2674,83 @@ impl RuntimeTurnLoopDriver { .await } }; - if result.is_ok() { - ensure_fallback_capture_recording(&store, session_id.as_str()); + let runtime_session_id = RuntimeSessionId::from_string(session_id.as_str().to_string())?; + match result { + Ok(last_agent_message) => { + ensure_fallback_capture_recording(&store, session_id.as_str()); + cleanup_session_resources_for_terminal( + runtime_handle.clone(), + runtime_session_id.clone(), + ) + .await?; + // Never lose finished work: if the model ended without a final + // message (no done() call — e.g. ended on leaked planning text + // or an empty turn), fall back to the best result.* artifact in + // the session cwd so session.done still carries the deliverable. + let mut final_message = last_agent_message; + let mut result_file: Option = None; + if final_message.is_none() && !cancel.is_cancelled() { + if let Some(fallback) = + fallback_result_file_for_session(&store, session_id.as_str()) + { + final_message = Some(fallback.text); + result_file = Some(fallback.file); + } + } + if let Some(text) = final_message.as_deref() { + runtime_handle.append_observed_session_event( + runtime_session_id, + names::SESSION_DONE, + session_done_payload(Some(text), result_file.as_ref()), + RuntimeDurability::Barrier, + )?; + } + Ok(final_message) + } + Err(error) => { + let _ = cleanup_session_resources_for_terminal( + runtime_handle.clone(), + runtime_session_id, + ) + .await; + Err(error) + } } - result + } +} + +async fn cleanup_session_resources_for_terminal( + runtime_handle: RuntimeHandle, + runtime_session_id: RuntimeSessionId, +) -> Result { + let session_for_error = runtime_session_id.as_str().to_string(); + tokio::task::spawn_blocking(move || { + std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + runtime_handle.cleanup_session_resources(&runtime_session_id) + })) + }) + .await + .map_err(|error| { + AgentError::Other(anyhow::anyhow!( + "terminal cleanup task failed to join for session {session_for_error}: {error}" + )) + })? + .map_err(|panic| { + AgentError::Other(anyhow::anyhow!( + "terminal cleanup panicked for session {session_for_error}: {}", + panic_payload_message(panic) + )) + })? + .map_err(AgentError::Other) +} + +fn panic_payload_message(payload: Box) -> String { + if let Some(message) = payload.downcast_ref::<&str>() { + (*message).to_string() + } else if let Some(message) = payload.downcast_ref::() { + message.clone() + } else { + "unknown panic payload".to_string() } } @@ -4402,6 +4538,105 @@ mod tests { ); } + #[tokio::test] + async fn runtime_backed_done_is_journaled_after_resource_cleanup() { + let (dir, store, session_id) = store_with_session(); + seed_user_input(&store, &session_id, "initial").await; + let journal = Arc::new(SqliteJournal::from_store( + Store::open(dir.path()).expect("open runtime store"), + )); + let persistence: Arc = journal.clone(); + let state_index: Arc = journal; + let runtime = BrowserUseRuntime::new(persistence, state_index).handle(); + let runtime_session_id = + RuntimeSessionId::from_string(session_id.clone()).expect("runtime session id"); + runtime + .attach_root_agent(AttachRootAgentRequest { + session_id: runtime_session_id.clone(), + cwd: std::path::PathBuf::from("/work"), + task: "root".to_string(), + max_concurrent_threads_per_session: 3, + }) + .expect("attach root"); + + let cleanup_store = Arc::clone(&store); + let cleanup_session_id = session_id.clone(); + runtime + .get_or_insert_session_resource( + &runtime_session_id, + "test.cleanup_before_done", + || 1usize, + move |_resource: Arc| { + let store = cleanup_store.lock().expect("store mutex poisoned"); + store + .append_event( + &cleanup_session_id, + "test.cleanup", + json!({ "phase": "terminal_barrier" }), + ) + .expect("append cleanup marker"); + 1 + }, + ) + .expect("register cleanup marker"); + let blocking_cleanup_store = Arc::clone(&store); + let blocking_cleanup_session_id = session_id.clone(); + runtime + .get_or_insert_session_resource( + &runtime_session_id, + "test.blocking_cleanup_before_done", + || 1usize, + move |_resource: Arc| { + let _client = reqwest::blocking::Client::new(); + let store = blocking_cleanup_store.lock().expect("store mutex poisoned"); + store + .append_event( + &blocking_cleanup_session_id, + "test.blocking_cleanup", + json!({ "phase": "terminal_barrier" }), + ) + .expect("append blocking cleanup marker"); + 1 + }, + ) + .expect("register blocking cleanup marker"); + + run_session_with_config_with_cancel_and_runtime( + Arc::clone(&store), + &session_id, + fake_config(), + CancellationToken::new(), + Some(runtime), + ) + .await + .expect("runtime-backed run"); + + let log = events(&store, &session_id); + let cleanup_seq = log + .iter() + .find(|event| event.event_type == "test.cleanup") + .map(|event| event.seq) + .expect("cleanup marker"); + let blocking_cleanup_seq = log + .iter() + .find(|event| event.event_type == "test.blocking_cleanup") + .map(|event| event.seq) + .expect("blocking cleanup marker"); + let done_seq = log + .iter() + .find(|event| event.event_type == names::SESSION_DONE) + .map(|event| event.seq) + .expect("session.done"); + assert!( + cleanup_seq < done_seq, + "terminal cleanup must be durable before session.done: cleanup={cleanup_seq}, done={done_seq}" + ); + assert!( + blocking_cleanup_seq < done_seq, + "blocking terminal cleanup must be durable before session.done: cleanup={blocking_cleanup_seq}, done={done_seq}" + ); + } + #[tokio::test] async fn token_status_prefers_provider_usage_over_whole_prompt_estimate() { let (_dir, store, session_id) = store_with_session(); diff --git a/crates/browser-use-agent/src/entrypoint/provider.rs b/crates/browser-use-agent/src/entrypoint/provider.rs index 3281a54c..a34ccbdc 100644 --- a/crates/browser-use-agent/src/entrypoint/provider.rs +++ b/crates/browser-use-agent/src/entrypoint/provider.rs @@ -120,6 +120,8 @@ pub type RealSamplingDriver = ModelSamplingDriver< RegistryRunner, >; +const DISABLE_LOCAL_SEARCH_ENV: &str = "BROWSER_USE_DISABLE_LOCAL_SEARCH"; + /// The production tool dispatcher type: a [`RegistryRunner`] whose approver is the /// REAL [`GuardianApprover`] (permissive sandbox seam). Named so the builder + the /// fused driver agree on the runner's generic arguments. @@ -1324,8 +1326,11 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( ); // `search`: locally-executed DuckDuckGo (Lite) web search — the client runs // the HTTP request and parses the results itself (distinct from the hosted - // `web_search` above). Read-only, so parallel_safe = true. - reg.register::<_, SearchRequest>("search", definitions::search(), true, SearchTool::new()); + // `web_search` above). Eval runs can disable it to avoid captcha-heavy + // detours while keeping hosted `web_search` available. + if local_search_enabled_for_run(config) { + reg.register::<_, SearchRequest>("search", definitions::search(), true, SearchTool::new()); + } let browser_backend = browser_backend_for_runtime_or_config( config, runtime_handle.as_ref(), @@ -1537,6 +1542,43 @@ fn legacy_subagent_tools_enabled_for_run(config: &ProviderRunConfig) -> bool { && config.options.child_agent_runner.is_some() } +fn local_search_enabled_for_run(config: &ProviderRunConfig) -> bool { + if config_override_bool_any( + &config.options.config_overrides, + &[ + "disable_local_search", + "tools.disable_local_search", + "search.disabled", + ], + ) == Some(true) + { + return false; + } + if config_override_bool_any( + &config.options.config_overrides, + &[ + "local_search_enabled", + "tools.local_search_enabled", + "search.enabled", + ], + ) == Some(false) + { + return false; + } + !std::env::var(DISABLE_LOCAL_SEARCH_ENV) + .ok() + .is_some_and(|value| env_flag_enabled(&value)) +} + +fn env_flag_enabled(value: &str) -> bool { + let normalized = value.trim().to_ascii_lowercase(); + !normalized.is_empty() + && !matches!( + normalized.as_str(), + "0" | "false" | "off" | "no" | "disabled" + ) +} + fn apply_role_tool_policy( reg: &mut crate::tools::registry::ToolRegistry, config: &ProviderRunConfig, @@ -3289,6 +3331,30 @@ mod tests { ); } + #[test] + fn disable_local_search_override_hides_duckduckgo_search_only() { + let options = crate::config_overrides::AgentRunOptions { + config_overrides: vec![( + "disable_local_search".to_string(), + toml::Value::Boolean(true), + )], + ..crate::config_overrides::AgentRunOptions::default() + }; + let config = + ProviderRunConfig::new(ProviderBackend::Fake, "fake-model").with_options(options); + let dispatcher = build_tool_dispatcher(Arc::new(MarkerPythonBackend), &config, None); + let names: Vec<&str> = dispatcher + .tool_specs() + .iter() + .map(|s| s.name.as_str()) + .collect(); + + assert!(names.contains(&"web_search")); + assert!(!names.contains(&"search")); + assert!(names.contains(&"browser_script")); + assert!(names.contains(&"done")); + } + /// A non-empty `mcp_servers` map registers the `mcp` tool. The stdio server /// command (`true`) connects to nothing useful, but `connect_all`'s per-server /// failure isolation still yields a manager and the registration wiring diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index db632c03..e07dfc9a 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -66,9 +66,12 @@ pub const DEFAULT_BROWSER_SCRIPT_TIMEOUT_SECS: u64 = 300; /// Default observe poll window (ms) for [`BrowserAction::Observe`]. /// -/// Long browser_script runs should be observed in coarse windows so the agent -/// does not burn many LLM turns polling the same run_id while work is ongoing. -pub const DEFAULT_OBSERVE_TIMEOUT_MS: u64 = 30_000; +/// Restored to the pre-regression (88-baseline) 1s default. The 30s default + +/// 30s clamp-floor ("observe30") blocked each observe for up to 30s, burning the +/// run-level task timebox on long-running scripts and leaving tasks unfinished +/// (e.g. real_v8 tasks 1, 4 never emitted session.done). The run-level timebox, +/// not coarse poll windows, is responsible for bounding total turns. +pub const DEFAULT_OBSERVE_TIMEOUT_MS: u64 = 1_000; pub const MAX_OBSERVE_TIMEOUT_MS: u64 = 120_000; /// Appended to `browser_script` stdout when the response carries image parts. @@ -79,15 +82,19 @@ pub const MAX_OBSERVE_TIMEOUT_MS: u64 = 120_000; pub const BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX: &str = "\n__browser_script_content__:"; /// Maximum bytes of browser-script text returned to the next model turn. /// -/// Full browser-script output is persisted through durable events/artifacts; the -/// inline model view is deliberately smaller because long eval tasks repeatedly -/// carry every prior tool result in later prompts. -pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 4 * 1024; +/// Matches the collected-output limit (SCRIPT_MAX_OUTPUT_CHARS = 120k): the +/// CURRENT turn must see the full script output to write correct follow-up +/// code (codex parity: fresh outputs are never capped; the context manager +/// truncates tool outputs only as they age into history via the policy*1.2 +/// rule in context/mod.rs). The old 4KB cap forced blind guess-first coding +/// (KeyError-class bug explosion 8->53 across the 88->81 eval regression). +pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 120 * 1024; const BROWSER_PREF_MODE: &str = "browser.preference.mode"; const BROWSER_PREF_BROWSER: &str = "browser.preference.browser"; const BROWSER_PREF_BROWSER_LABEL: &str = "browser.preference.browser_label"; const BROWSER_PREF_PROFILE: &str = "browser.preference.profile"; +const EVAL_MAX_OBSERVE_TIMEOUT_ENV: &str = "BROWSER_USE_EVAL_MAX_OBSERVE_TIMEOUT_MS"; const BROWSER_DOMAIN_PROFILE_PREFIX: &str = "browser.domain_profile."; const BROWSER_SCRIPT_MAX_IMAGE_DIMENSION: u32 = 8_000; const BROWSER_PREF_PROFILE_LABEL: &str = "browser.preference.profile_label"; @@ -201,12 +208,38 @@ impl BrowserRequest { } fn effective_observe_ms(&self) -> u64 { - self.observe_timeout_ms + let normal = self + .observe_timeout_ms .unwrap_or(DEFAULT_OBSERVE_TIMEOUT_MS) - .clamp(DEFAULT_OBSERVE_TIMEOUT_MS, MAX_OBSERVE_TIMEOUT_MS) + .clamp(DEFAULT_OBSERVE_TIMEOUT_MS, MAX_OBSERVE_TIMEOUT_MS); + match eval_max_observe_timeout_ms() { + Some(max_ms) => normal.min(max_ms.max(1_000)), + None => normal, + } } } +fn eval_max_observe_timeout_ms() -> Option { + // Observe timing is controlled ONLY by its own env var. It used to also be + // implicitly capped at 30s whenever BROWSER_USE_EVAL_DONE_AUDIT was set — + // a hidden cross-subsystem coupling (a finalization flag silently changing + // polling behavior) that made eval runs hard to reason about. + std::env::var(EVAL_MAX_OBSERVE_TIMEOUT_ENV) + .ok() + .and_then(|value| value.trim().parse::().ok()) + .filter(|value| *value > 0) +} + +#[allow(dead_code)] +fn env_flag_enabled(value: &str) -> bool { + let normalized = value.trim().to_ascii_lowercase(); + !normalized.is_empty() + && !matches!( + normalized.as_str(), + "0" | "false" | "off" | "no" | "disabled" + ) +} + /// Model-facing wire arguments for the browser tool. /// /// [`BrowserRequest`] is a PARSED form: its [`BrowserAction`] is an internally @@ -2375,7 +2408,7 @@ fn cap_inline_browser_script_stdout(text: String) -> String { let elided = text.len() - end; let mut out = text[..end].to_string(); out.push_str(&format!( - "\n... [browser_script stdout truncated, {elided} more bytes; full output persisted. Use a narrower browser_script extraction, the emitted summaries, or a saved artifact instead of re-reading broad page text.]" + "\n... [browser_script stdout truncated, {elided} more bytes; full output persisted to the run artifact. Read the saved artifact or re-extract the missing portion before relying on this output.]" )); out } @@ -3176,4 +3209,25 @@ mod browser_mode_tests { ) .unwrap(); } + + #[test] + fn stored_cloud_preference_rejects_managed_only_recovery() { + let dir = tempfile::tempdir().unwrap(); + let store = Store::open(dir.path()).unwrap(); + store.set_setting(BROWSER_PREF_MODE, "cloud").unwrap(); + + let err = resolve_browser_command_for_selected_mode( + Some(&store), + "browser recover restart-owned-browser", + None, + None, + ) + .unwrap_err(); + + assert!( + err.to_string() + .contains("restart-owned-browser only applies to managed Chromium"), + "{err:#}" + ); + } } diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index bd7ef14f..5c041f47 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -1422,7 +1422,7 @@ async fn script_truncated_structured_output_preserves_summary_first() { ); assert!( out.stdout - .contains("Use a narrower browser_script extraction"), + .contains("Read the saved artifact or re-extract the missing portion"), "stdout: {}", out.stdout ); @@ -1434,8 +1434,11 @@ async fn script_truncated_structured_output_preserves_summary_first() { } #[test] -fn browser_script_stdout_cap_defaults_to_four_kib_for_eval_cost() { - assert_eq!(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, 4 * 1024); +fn browser_script_stdout_cap_matches_collected_output_limit() { + // The CURRENT turn must see full script output to write correct follow-up + // code; history growth is handled by the context manager (policy*1.2). + // The old 4KB cap caused blind guess-first coding (88->81 regression). + assert_eq!(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, 120 * 1024); } #[tokio::test] @@ -1594,10 +1597,9 @@ async fn observe_timeout_is_clamped_to_coarse_poll_window() { }; let out = run_direct(&tool, &req).await.unwrap(); assert_eq!(out.exit_code, 0); - assert_eq!( - backend.last_observe_timeout_ms(), - Some(DEFAULT_OBSERVE_TIMEOUT_MS) - ); + // 5s sits inside the allowed [DEFAULT_OBSERVE_TIMEOUT_MS, MAX] window and + // passes through unchanged (the old 30s floor inflated short observes). + assert_eq!(backend.last_observe_timeout_ms(), Some(5_000)); req.observe_timeout_ms = Some(180_000); let out = run_direct(&tool, &req).await.unwrap(); diff --git a/crates/browser-use-agent/src/tools/handlers/done.rs b/crates/browser-use-agent/src/tools/handlers/done.rs index 6b38c8cf..bc5c932e 100644 --- a/crates/browser-use-agent/src/tools/handlers/done.rs +++ b/crates/browser-use-agent/src/tools/handlers/done.rs @@ -16,16 +16,11 @@ //! recognize that the agent declared itself finished, and so the final `text` //! (the summary) is surfaced to the host. //! -//! It does NOT itself force the turn loop to stop: the loop's termination signal -//! is "the model produced a final assistant message with NO tool calls" -//! ([`TurnRunOutcome::NoToolCalls`](crate::turn::loop::TurnRunOutcome)). A `done` -//! call is a tool call, so it is dispatched, recorded, and the loop re-samples; -//! the model then typically produces a final no-tool message and the loop stops. -//! Wiring the loop to treat a successful `done` [`ExecOutput`] as terminal (a -//! short-circuit) needs the loop's classifier (`turn/loop.rs` / -//! `turn/fusion.rs`) to inspect the dispatched tool name/output — those files are -//! outside this WP's owned set, so that deeper loop wiring is REPORTED, not -//! implemented here. +//! A successful `done` output is treated as terminal by the fused sampling loop. +//! In eval mode (`BROWSER_USE_EVAL_DONE_AUDIT=1`) this handler performs a small +//! completion audit first. If the final answer is obviously empty, placeholder +//! heavy, or explicitly partial, it rejects the call so the loop gives the model +//! one more repair turn instead of accepting a weak completion. //! //! # Parity grounding //! @@ -49,6 +44,12 @@ use crate::tools::runtime::{ Approvable, ExecOutput, SandboxAttempt, Sandboxable, ToolCtx, ToolError, ToolRuntime, }; use crate::tools::sandbox::{SandboxPermissions, SandboxPreference}; +use serde_json::Value; +use std::fs; +use std::path::{Path, PathBuf}; + +const EVAL_DONE_AUDIT_ENV: &str = "BROWSER_USE_EVAL_DONE_AUDIT"; +const DONE_AUDIT_TEXT_PREVIEW_BYTES: usize = 256 * 1024; /// The tool name surfaced to the model. /// @@ -207,12 +208,16 @@ impl ToolRuntime for DoneTool { &self, req: &DoneRequest, attempt: &SandboxAttempt<'_>, - _ctx: &ToolCtx, + ctx: &ToolCtx, ) -> Result { // No sandbox is exercised (the tool does no I/O); acknowledge the attempt // to make the seam explicit, matching the other tools. let _ = attempt; + if eval_done_audit_enabled() { + audit_done_request(req, ctx).map_err(ToolError::Rejected)?; + } + // Record the final summary into a deterministic, prefixed acknowledgement // the loop/host can recognize as the declared completion. The summary may // be empty (the model can declare done without a message). @@ -224,3 +229,244 @@ impl ToolRuntime for DoneTool { }) } } + +fn eval_done_audit_enabled() -> bool { + std::env::var(EVAL_DONE_AUDIT_ENV) + .ok() + .is_some_and(|value| env_flag_enabled(&value)) +} + +fn env_flag_enabled(value: &str) -> bool { + let normalized = value.trim().to_ascii_lowercase(); + !normalized.is_empty() + && !matches!( + normalized.as_str(), + "0" | "false" | "off" | "no" | "disabled" + ) +} + +pub(crate) fn audit_done_request(req: &DoneRequest, ctx: &ToolCtx) -> Result<(), String> { + let mut reasons = Vec::new(); + let mut audit_text = req.summary(); + let mut has_material_answer = !audit_text.trim().is_empty(); + + if let Some(result_file) = req + .result_file + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + match read_result_file_preview(result_file, ctx) { + Ok(preview) => { + has_material_answer = true; + if let Some(preview) = preview { + reasons.extend(json_audit_reasons(&preview)); + if !audit_text.is_empty() { + audit_text.push('\n'); + } + audit_text.push_str(&preview); + } + } + Err(reason) => reasons.push(reason), + } + } + + if !has_material_answer { + reasons.push("no final answer text or readable non-empty result_file".to_string()); + } + + reasons.extend(text_audit_reasons(&audit_text)); + reasons.extend(json_audit_reasons(&audit_text)); + + if reasons.is_empty() { + Ok(()) + } else { + Err(format!( + "done audit rejected the final answer: {}. Continue working, verify and fill the missing fields, then call done again. Only use an incomplete fallback if the run is genuinely out of turns.", + reasons.join("; ") + )) + } +} + +fn read_result_file_preview(path: &str, ctx: &ToolCtx) -> Result, String> { + let resolved = resolve_result_file(path, ctx); + let metadata = fs::metadata(&resolved).map_err(|error| { + format!( + "result_file `{}` is not readable at {} ({error})", + path, + resolved.display() + ) + })?; + if !metadata.is_file() { + return Err(format!( + "result_file `{}` is not a regular file ({})", + path, + resolved.display() + )); + } + if metadata.len() == 0 { + return Err(format!("result_file `{}` is empty", path)); + } + + let bytes = fs::read(&resolved).map_err(|error| { + format!( + "result_file `{}` could not be read at {} ({error})", + path, + resolved.display() + ) + })?; + let cap = bytes.len().min(DONE_AUDIT_TEXT_PREVIEW_BYTES); + Ok(std::str::from_utf8(&bytes[..cap]) + .ok() + .map(ToOwned::to_owned)) +} + +fn resolve_result_file(path: &str, ctx: &ToolCtx) -> PathBuf { + let requested = Path::new(path); + if requested.is_absolute() { + return requested.to_path_buf(); + } + + let cwd_path = ctx.cwd.join(requested); + if cwd_path.exists() { + return cwd_path; + } + + let artifact_path = ctx.artifact_root.join(requested); + if artifact_path.exists() { + return artifact_path; + } + + cwd_path +} + +fn text_audit_reasons(text: &str) -> Vec { + let normalized = text.to_ascii_lowercase(); + let markers = [ + ("partial_incomplete", "declares partial_incomplete"), + ("partial / incomplete", "declares partial/incomplete"), + ("partial/incomplete", "declares partial/incomplete"), + ("not completed", "declares not completed"), + ("not checked", "declares not checked"), + ("not extracted", "declares not extracted"), + ("could not verify", "declares unverified data"), + ("couldn't verify", "declares unverified data"), + ("could not access", "declares inaccessible source"), + ("couldn't access", "declares inaccessible source"), + ("unavailable due to", "declares unavailable data"), + ("source unavailable", "declares unavailable source"), + ("blocked by", "declares blocked source"), + ("unable to complete", "declares incomplete task"), + ("task is incomplete", "declares incomplete task"), + ]; + + markers + .iter() + .filter_map(|(needle, reason)| normalized.contains(needle).then(|| (*reason).to_string())) + .collect() +} + +fn json_audit_reasons(text: &str) -> Vec { + let Some(value) = parse_first_json_value(text) else { + return Vec::new(); + }; + let mut reasons = Vec::new(); + if json_is_empty_result(&value) { + reasons.push("JSON result is empty".to_string()); + } + let mut stats = JsonPlaceholderStats::default(); + collect_json_placeholder_stats(&value, &mut stats); + if stats.total_scalar_fields >= 8 + && stats.placeholder_fields * 100 >= stats.total_scalar_fields * 30 + { + reasons.push(format!( + "JSON result has too many placeholder fields ({}/{})", + stats.placeholder_fields, stats.total_scalar_fields + )); + } + reasons +} + +fn parse_first_json_value(text: &str) -> Option { + let trimmed = text.trim(); + serde_json::from_str(trimmed).ok() +} + +fn json_is_empty_result(value: &Value) -> bool { + match value { + Value::Array(items) => items.is_empty(), + Value::Object(map) => { + if map.is_empty() { + return true; + } + let array_values = map + .values() + .filter_map(|value| value.as_array()) + .collect::>(); + !array_values.is_empty() && array_values.iter().all(|items| items.is_empty()) + } + _ => false, + } +} + +#[derive(Default)] +struct JsonPlaceholderStats { + total_scalar_fields: usize, + placeholder_fields: usize, +} + +fn collect_json_placeholder_stats(value: &Value, stats: &mut JsonPlaceholderStats) { + match value { + // A field explicitly set to null means the agent checked the source and + // recorded a genuine absence. Many tasks REQUIRE null/empty for + // unavailable fields, so count null toward the denominator but NOT as a + // placeholder. Counting it as a placeholder pushed the agent to delete + // required fields to satisfy the audit (real_v8 task 53 regression). + Value::Null => stats.total_scalar_fields += 1, + Value::Bool(_) | Value::Number(_) => stats.total_scalar_fields += 1, + Value::String(text) => { + stats.total_scalar_fields += 1; + if is_placeholder_string(text) { + stats.placeholder_fields += 1; + } + } + Value::Array(items) => { + for item in items { + collect_json_placeholder_stats(item, stats); + } + } + Value::Object(map) => { + for value in map.values() { + collect_json_placeholder_stats(value, stats); + } + } + } +} + +fn is_placeholder_string(text: &str) -> bool { + let normalized = text.trim().to_ascii_lowercase(); + // An empty string is a deliberate "checked, genuinely absent" sentinel, and + // many tasks explicitly MANDATE "" for missing values. Counting "" as a + // placeholder rejected spec-correct answers and coerced literal placeholder + // prose instead (real_v8 task 94). null is treated the same way. + if normalized.is_empty() { + return false; + } + matches!( + normalized.as_str(), + "unknown" + | "n/a" + | "na" + | "none" + | "null" + | "missing" + | "not found" + | "unavailable" + | "not available" + | "not listed" + | "not checked" + | "not extracted" + | "could not determine" + ) || normalized.starts_with("could not ") + || normalized.starts_with("unable to ") +} diff --git a/crates/browser-use-agent/src/tools/handlers/done_tests.rs b/crates/browser-use-agent/src/tools/handlers/done_tests.rs index 198e493d..8d4459d8 100644 --- a/crates/browser-use-agent/src/tools/handlers/done_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/done_tests.rs @@ -5,7 +5,7 @@ //! analog) — direct `run` calls plus one drive through the orchestrator over the //! seam. -use super::done::{DoneRequest, DoneTool, DONE_STDOUT_PREFIX}; +use super::done::{audit_done_request, DoneRequest, DoneTool, DONE_STDOUT_PREFIX}; use crate::tools::approval::AskForApproval; use crate::tools::orchestrator::{ToolOrchestrator, TurnEnv}; use crate::tools::runtime::{AutoApprover, SandboxAttempt, ToolCtx, ToolRuntime}; @@ -175,3 +175,70 @@ fn done_is_not_parallel_safe() { "done must be serial: completion is terminal and must not be reordered" ); } + +// ---- (6) eval-mode completion audit catches weak finals before terminal stop ---- + +#[test] +fn eval_done_audit_rejects_explicit_partial_completion() { + let err = audit_done_request( + &DoneRequest::with_result("partial_incomplete: two fields were not checked"), + &ctx(), + ) + .expect_err("partial completion should be rejected"); + + assert!(err.contains("partial_incomplete"), "got: {err}"); + assert!(err.contains("Continue working"), "got: {err}"); +} + +#[test] +fn eval_done_audit_rejects_missing_result_file() { + let req = DoneRequest { + result_file: Some("missing-result.json".to_string()), + ..DoneRequest::default() + }; + + let err = audit_done_request(&req, &ctx()).expect_err("missing result file should reject"); + assert!(err.contains("result_file"), "got: {err}"); + assert!(err.contains("not readable"), "got: {err}"); +} + +#[test] +fn eval_done_audit_rejects_placeholder_heavy_json() { + let temp = tempfile::tempdir().unwrap(); + let path = temp.path().join("answer.json"); + std::fs::write( + &path, + r#"{ + "items": [ + {"name": "unknown", "price": "not found", "url": ""}, + {"name": "missing", "price": "unavailable", "url": "not checked"}, + {"name": "ok", "price": "12", "url": "https://example.test"} + ] + }"#, + ) + .unwrap(); + let req = DoneRequest { + result_file: Some(path.to_string_lossy().to_string()), + ..DoneRequest::default() + }; + + let err = audit_done_request(&req, &ctx()).expect_err("placeholder JSON should reject"); + assert!(err.contains("placeholder"), "got: {err}"); +} + +#[test] +fn eval_done_audit_accepts_non_empty_material_file() { + let temp = tempfile::tempdir().unwrap(); + let path = temp.path().join("answer.json"); + std::fs::write( + &path, + r#"{"items":[{"name":"A","price":"12","url":"https://example.test/a"},{"name":"B","price":"14","url":"https://example.test/b"}]}"#, + ) + .unwrap(); + let req = DoneRequest { + result_file: Some(path.to_string_lossy().to_string()), + ..DoneRequest::default() + }; + + audit_done_request(&req, &ctx()).expect("material JSON should pass"); +} diff --git a/crates/browser-use-agent/src/tools/handlers/shell.rs b/crates/browser-use-agent/src/tools/handlers/shell.rs index 5ab33d8c..1e6409c8 100644 --- a/crates/browser-use-agent/src/tools/handlers/shell.rs +++ b/crates/browser-use-agent/src/tools/handlers/shell.rs @@ -48,6 +48,7 @@ use crate::tools::unified_exec::{ /// Matches codex `DEFAULT_EXEC_COMMAND_TIMEOUT_MS` (exec.rs:51) and legacy /// `DEFAULT_SHELL_COMMAND_TIMEOUT_MS` (command.rs:26). pub const DEFAULT_SHELL_COMMAND_TIMEOUT_MS: u64 = 10_000; +pub const DEFAULT_EXEC_COMMAND_TIMEOUT_MS: u64 = 600_000; /// Exit code reported when a command is killed for exceeding its timeout. /// @@ -312,6 +313,10 @@ pub struct ExecCommandRequest { /// Initial output collection window before returning a process id. #[serde(default)] pub yield_time_ms: Option, + /// Maximum wall time before the process tree is killed. Defaults to a + /// generous cap so one forgotten command cannot pin the whole run. + #[serde(default)] + pub timeout_ms: Option, /// Maximum model-visible output tokens. #[serde(default)] pub max_output_tokens: Option, @@ -348,6 +353,10 @@ impl ExecCommandRequest { fn yield_time_ms(&self) -> u64 { self.yield_time_ms.unwrap_or(DEFAULT_EXEC_YIELD_TIME_MS) } + + fn timeout_ms(&self) -> u64 { + self.timeout_ms.unwrap_or(DEFAULT_EXEC_COMMAND_TIMEOUT_MS) + } } /// Send stdin to, or poll, a live unified exec process. @@ -610,7 +619,7 @@ impl ToolRuntime for ExecCommandTool { tty: req.tty, yield_time_ms: req.yield_time_ms(), max_output_tokens: req.max_output_tokens, - timeout_ms: None, + timeout_ms: Some(req.timeout_ms()), kill_on_cancel: false, call_id: ctx.call_id.clone(), tool_name: ctx.tool_name.clone(), diff --git a/crates/browser-use-agent/src/tools/handlers/shell_tests.rs b/crates/browser-use-agent/src/tools/handlers/shell_tests.rs index 76cbf324..caca0ef7 100644 --- a/crates/browser-use-agent/src/tools/handlers/shell_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/shell_tests.rs @@ -96,6 +96,7 @@ async fn exec_command_returns_session_id_and_write_stdin_polls_to_exit() { shell: None, login: None, yield_time_ms: Some(10), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -130,6 +131,45 @@ async fn exec_command_returns_session_id_and_write_stdin_polls_to_exit() { assert!(second.stdout.contains("Output:\ndone")); } +#[tokio::test] +async fn exec_command_timeout_kills_process_group() { + let dir = tempfile::tempdir().unwrap(); + let manager = UnifiedExecManager::deterministic_for_tests(); + let exec = ExecCommandTool::new(manager); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let exec_ctx = ctx_for(dir.path(), "exec_command"); + let marker = dir.path().join("late.txt"); + + let out = exec + .run( + &ExecCommandRequest { + cmd: Some("(sleep 1; printf bad > late.txt) & sleep 5".to_string()), + command: None, + workdir: None, + cwd: None, + shell: None, + login: None, + yield_time_ms: Some(1_000), + timeout_ms: Some(100), + max_output_tokens: None, + env: HashMap::new(), + tty: false, + }, + &attempt, + &exec_ctx, + ) + .await + .expect("exec_command returns timeout snapshot"); + + assert!(out.stdout.contains("Process exited with code 124")); + tokio::time::sleep(std::time::Duration::from_millis(1_200)).await; + assert!( + !marker.exists(), + "exec_command timeout should kill background children" + ); +} + #[tokio::test] async fn write_stdin_rejects_non_tty_input() { let dir = tempfile::tempdir().unwrap(); @@ -151,6 +191,7 @@ async fn write_stdin_rejects_non_tty_input() { shell: None, login: None, yield_time_ms: Some(10), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -205,6 +246,7 @@ async fn tty_write_stdin_sends_input_and_returns_exit_metadata() { shell: None, login: None, yield_time_ms: Some(10), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: true, @@ -262,6 +304,7 @@ async fn exec_command_applies_codex_env_defaults() { shell: None, login: None, yield_time_ms: Some(1_000), + timeout_ms: None, max_output_tokens: None, env, tty: false, @@ -300,6 +343,7 @@ async fn exec_command_interruption_preserves_live_session() { shell: None, login: None, yield_time_ms: Some(5_000), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -371,6 +415,7 @@ async fn exec_command_cancel_returns_live_session_snapshot() { shell: None, login: None, yield_time_ms: Some(30_000), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -429,6 +474,7 @@ async fn terminate_all_best_effort_kills_managed_processes() { shell: None, login: None, yield_time_ms: Some(250), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -469,6 +515,7 @@ async fn exec_command_max_output_tokens_truncates_model_output() { shell: None, login: None, yield_time_ms: Some(1_000), + timeout_ms: None, max_output_tokens: Some(2), env: HashMap::new(), tty: false, diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 869a96ff..3a690e38 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -816,6 +816,10 @@ pub mod definitions { "type": "integer", "description": "How long to wait (in milliseconds) for output before yielding." }, + "timeout_ms": { + "type": "integer", + "description": "Maximum wall time in milliseconds before the process tree is killed. Defaults to a generous failsafe cap." + }, "max_output_tokens": { "type": "integer", "description": "Maximum number of tokens to return. Excess output will be truncated." @@ -1116,7 +1120,7 @@ to the single frame that proves the task succeeded." ToolDefinition { name: "done".to_string(), description: - "Signal that the task is finished, carrying the user-facing final answer. If the wall-clock or step budget is nearly exhausted, call done with the best verified partial result instead of continuing until external cancellation; clearly mark unknown, unavailable, or incomplete fields and name the remaining gaps." + "Signal that the user-facing task is complete, carrying the final answer. Call done only after the requested result is complete and verified or persisted. If the remaining step budget is nearly exhausted and a complete answer is no longer possible, call done with an explicitly incomplete fallback result that names the missing requirements." .to_string(), input_schema: json!({ "type": "object", diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 3b92411d..a74b0795 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -1024,13 +1024,14 @@ fn definitions_carry_required_fields_and_names() { } #[test] -fn done_definition_preserves_timebox_partial_result_guidance() { +fn done_definition_requires_complete_result_with_step_exhaustion_fallback() { let done = definitions::done(); - assert!(done.description.contains("best verified partial result")); - assert!(done.description.contains("external cancellation")); + assert!(done.description.contains("task is complete")); + assert!(done.description.contains("complete and verified")); + assert!(done.description.contains("remaining step budget")); assert!(done .description - .contains("unknown, unavailable, or incomplete fields")); + .contains("explicitly incomplete fallback result")); } #[test] diff --git a/crates/browser-use-agent/src/tools/unified_exec.rs b/crates/browser-use-agent/src/tools/unified_exec.rs index da97c91d..89616e49 100644 --- a/crates/browser-use-agent/src/tools/unified_exec.rs +++ b/crates/browser-use-agent/src/tools/unified_exec.rs @@ -7,6 +7,8 @@ use std::collections::HashMap; use std::io::{Read, Write}; +#[cfg(unix)] +use std::os::unix::process::CommandExt; use std::path::{Path, PathBuf}; use std::process::{Child as StdChild, Command as StdCommand, Stdio}; use std::sync::Arc; @@ -618,7 +620,10 @@ struct BlockingReader { } enum ManagedChild { - Pipe(StdChild), + Pipe { + child: StdChild, + process_group: Option, + }, Pty(Box), } @@ -626,6 +631,12 @@ enum ManagedStdin { Pty(Box), } +#[derive(Clone, Copy, Debug)] +struct ProcessGroup { + #[cfg(unix)] + pgid: libc::pid_t, +} + async fn spawn_managed_backend( argv: &[String], cwd: &PathBuf, @@ -652,6 +663,7 @@ async fn spawn_pipe_backend( .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()); + configure_pipe_child_process_group(&mut command); for (key, value) in env { command.env(key, value); } @@ -659,6 +671,7 @@ async fn spawn_pipe_backend( let mut child = command.spawn().map_err(|source| { ToolError::Other(anyhow::anyhow!("failed to spawn `{program}`: {source}")) })?; + let process_group = process_group_for_child(&child); let stdout = child.stdout.take(); let stderr = child.stderr.take(); let mut blocking_readers = Vec::new(); @@ -675,7 +688,10 @@ async fn spawn_pipe_backend( }); } Ok(SpawnedProcess { - child: ManagedChild::Pipe(child), + child: ManagedChild::Pipe { + child, + process_group, + }, stdin: None, pty_master: None, reader_count: blocking_readers.len(), @@ -683,6 +699,35 @@ async fn spawn_pipe_backend( }) } +fn configure_pipe_child_process_group(command: &mut StdCommand) { + #[cfg(unix)] + unsafe { + command.pre_exec(|| { + if libc::setsid() < 0 { + Err(std::io::Error::last_os_error()) + } else { + Ok(()) + } + }); + } + #[cfg(not(unix))] + let _ = command; +} + +fn process_group_for_child(child: &StdChild) -> Option { + #[cfg(unix)] + { + Some(ProcessGroup { + pgid: child.id() as libc::pid_t, + }) + } + #[cfg(not(unix))] + { + let _ = child; + None + } +} + async fn spawn_pty_backend( argv: &[String], cwd: &PathBuf, @@ -882,7 +927,7 @@ impl ManagedProcess { let status = { let mut child = self.child.lock().await; match &mut *child { - ManagedChild::Pipe(child) => child + ManagedChild::Pipe { child, .. } => child .try_wait() .map_err(|source| { ToolError::Other(anyhow::anyhow!("polling process: {source}")) @@ -971,9 +1016,11 @@ impl ManagedProcess { { let mut child = self.child.lock().await; match &mut *child { - ManagedChild::Pipe(child) => { - let _ = child.kill(); - wait_for_std_child_exit(child, Duration::from_secs(2)).await; + ManagedChild::Pipe { + child, + process_group, + } => { + terminate_std_child_tree(child, *process_group, Duration::from_secs(2)).await; } ManagedChild::Pty(child) => { let _ = child.kill(); @@ -993,9 +1040,11 @@ impl ManagedProcess { { let mut child = self.child.lock().await; match &mut *child { - ManagedChild::Pipe(child) => { - let _ = child.kill(); - wait_for_std_child_exit(child, Duration::from_secs(2)).await; + ManagedChild::Pipe { + child, + process_group, + } => { + terminate_std_child_tree(child, *process_group, Duration::from_secs(2)).await; } ManagedChild::Pty(child) => { let _ = child.kill(); @@ -1014,9 +1063,15 @@ impl ManagedProcess { { let mut child = self.child.blocking_lock(); match &mut *child { - ManagedChild::Pipe(child) => { - let _ = child.kill(); - wait_for_std_child_exit_blocking(child, Duration::from_secs(2)); + ManagedChild::Pipe { + child, + process_group, + } => { + terminate_std_child_tree_blocking( + child, + *process_group, + Duration::from_secs(2), + ); } ManagedChild::Pty(child) => { let _ = child.kill(); @@ -1296,6 +1351,18 @@ async fn wait_for_std_child_exit(child: &mut StdChild, max_wait: Duration) { } } +async fn terminate_std_child_tree( + child: &mut StdChild, + process_group: Option, + max_wait: Duration, +) { + signal_process_group(process_group, ProcessSignal::Terminate); + let _ = child.kill(); + wait_for_std_child_exit(child, max_wait).await; + signal_process_group(process_group, ProcessSignal::Kill); + wait_for_std_child_exit(child, Duration::from_millis(EXIT_WATCH_INTERVAL_MS)).await; +} + fn wait_for_std_child_exit_blocking(child: &mut StdChild, max_wait: Duration) { let deadline = Instant::now() + max_wait; loop { @@ -1307,6 +1374,42 @@ fn wait_for_std_child_exit_blocking(child: &mut StdChild, max_wait: Duration) { } } +fn terminate_std_child_tree_blocking( + child: &mut StdChild, + process_group: Option, + max_wait: Duration, +) { + signal_process_group(process_group, ProcessSignal::Terminate); + let _ = child.kill(); + wait_for_std_child_exit_blocking(child, max_wait); + signal_process_group(process_group, ProcessSignal::Kill); + wait_for_std_child_exit_blocking(child, Duration::from_millis(EXIT_WATCH_INTERVAL_MS)); +} + +#[derive(Clone, Copy)] +enum ProcessSignal { + Terminate, + Kill, +} + +fn signal_process_group(process_group: Option, signal: ProcessSignal) { + #[cfg(unix)] + { + let Some(process_group) = process_group else { + return; + }; + let signal = match signal { + ProcessSignal::Terminate => libc::SIGTERM, + ProcessSignal::Kill => libc::SIGKILL, + }; + unsafe { + let _ = libc::kill(-process_group.pgid, signal); + } + } + #[cfg(not(unix))] + let _ = (process_group, signal); +} + fn clamp_yield_time(yield_time_ms: u64, empty_poll: bool) -> u64 { if empty_poll { yield_time_ms @@ -1383,3 +1486,56 @@ fn signal_exit_code(status: &std::process::ExitStatus) -> i32 { let _ = status; -1 } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::time::Duration; + + use super::{SpawnProcessRequest, UnifiedExecManager}; + + fn request(argv: Vec, cwd: std::path::PathBuf, timeout_ms: u64) -> SpawnProcessRequest { + SpawnProcessRequest { + argv, + cwd, + env: HashMap::new(), + tty: false, + yield_time_ms: 1_000, + max_output_tokens: None, + timeout_ms: Some(timeout_ms), + kill_on_cancel: true, + call_id: "test-call".to_string(), + tool_name: "shell".to_string(), + emitter: None, + cancel: None, + } + } + + #[cfg(unix)] + #[tokio::test] + async fn timeout_kills_background_children_before_they_can_write() { + let dir = tempfile::tempdir().expect("tempdir"); + let late_file = dir.path().join("late.txt"); + let manager = UnifiedExecManager::deterministic_for_tests(); + + let snapshot = manager + .run_to_completion(request( + vec![ + "sh".to_string(), + "-c".to_string(), + "(sleep 1; printf bad > late.txt) & sleep 5".to_string(), + ], + dir.path().to_path_buf(), + 200, + )) + .await + .expect("run command"); + + assert!(snapshot.timed_out, "command should time out"); + tokio::time::sleep(Duration::from_millis(1_400)).await; + assert!( + !late_file.exists(), + "timed-out background child must not survive to write artifacts" + ); + } +} diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index 531739f4..36b6e18f 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -721,6 +721,19 @@ fn calls_done_tool(tool_calls: &[ContentPart]) -> bool { .any(|p| matches!(p, ContentPart::ToolCall { name, .. } if name == DONE_TOOL_NAME)) } +fn done_tool_succeeded(tool_calls: &[ContentPart], outputs: &[Message]) -> bool { + tool_calls + .iter() + .zip(outputs.iter()) + .any(|(call, output)| match call { + ContentPart::ToolCall { name, .. } if name == DONE_TOOL_NAME => { + let (_, is_error) = tool_result_text_and_status(output); + !is_error + } + _ => false, + }) +} + /// The final summary carried by the model's `done` call, if any. /// /// Reads the `result` field from the first `done` tool call's JSON arguments, @@ -989,19 +1002,11 @@ impl SamplingDriver match (&self.dispatcher, &self.recorder, tool_calls.is_empty()) { // Fused path with at least one tool call. (Some(dispatcher), Some(recorder), false) => { - // A `done` call declares the turn finished: dispatch it (so the - // summary is recorded) but report NO follow-up, terminating the - // loop. Detect it BEFORE the calls vec is consumed by dispatch. - let is_terminal = calls_done_tool(&tool_calls); - // Surface the `done` summary as the turn result when the model - // declared completion via `done` and streamed no other text, so - // the loop returns the summary (codex keeps the final message). - if is_terminal && last_agent_message.is_none() { - last_agent_message = done_summary(&tool_calls); - if last_agent_message.is_some() { - acc.defers_mailbox_delivery_to_next_turn = true; - } - } + // A successful `done` call declares the turn finished. Detect + // the call before dispatch, but decide terminality only after + // the handler output has been recorded; eval-mode done audit + // can reject weak completions and should get a repair turn. + let has_done_call = calls_done_tool(&tool_calls); // 1. Record the assistant message (text + tool calls), so the // recorded transcript carries the call before its output. @@ -1025,14 +1030,29 @@ impl SamplingDriver recorder.record(&result.outputs_in_order).await; } - // A `done` call is TERMINAL: the model declared completion, so - // the loop must stop even though a tool ran. Otherwise, follow-up - // iff a tool actually ran (codex re-samples after feeding tool - // outputs back into history). - if is_terminal { + let done_succeeded = has_done_call + && done_tool_succeeded(&tool_calls, &result.outputs_in_order); + + // Surface the successful `done` summary as the turn result + // when the model declared completion and streamed no other + // text, so the loop returns the summary (codex keeps the final + // message). Rejected done calls keep following up. + if done_succeeded && last_agent_message.is_none() { + last_agent_message = done_summary(&tool_calls); + if last_agent_message.is_some() { + acc.defers_mailbox_delivery_to_next_turn = true; + } + } + + // A successful `done` call is TERMINAL. A rejected `done` call + // needs follow-up so the model can repair the final answer. + if done_succeeded { false } else { - decision::needs_follow_up(result.needs_follow_up, false) + decision::needs_follow_up( + result.needs_follow_up || has_done_call, + false, + ) } } // Text-only sampler, OR fusion configured but no tool call: the diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index ed46a265..facd5c9c 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -29,7 +29,7 @@ const BU_API: &str = "https://api.browser-use.com/api/v3"; const LOG_LIMIT: usize = 250; const SCRIPT_MAX_OUTPUT_CHARS: usize = 120_000; const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 15_000; -const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 30_000; +const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 1_000; const BROWSER_SCRIPT_HELPERS: &str = include_str!("browser_script_helpers.py"); const BROWSER_CONNECT_LOCAL_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(120); const BROWSER_CONNECT_ATTACH_DEADLINE: Duration = Duration::from_secs(8); @@ -276,8 +276,8 @@ static MANAGED_BROWSER_PIDS: OnceLock>> = OnceLock::new(); static LOCAL_CDP_CONNECTIONS: OnceLock>> = OnceLock::new(); static BROWSER_SCRIPT_RUN_COUNTER: AtomicU64 = AtomicU64::new(1); -const BROWSER_SCRIPT_COMPLETED_CACHE_TTL_MS: u128 = 10 * 60 * 1_000; -const BROWSER_SCRIPT_COMPLETED_CACHE_MAX: usize = 128; +const BROWSER_SCRIPT_COMPLETED_CACHE_TTL_MS: u128 = 60 * 60 * 1_000; +const BROWSER_SCRIPT_COMPLETED_CACHE_MAX: usize = 2048; const MANAGED_BROWSER_PROFILE_PREFIX: &str = "but-managed-browser."; const MANAGED_BROWSER_MARKER_FILE: &str = "BrowserUseManagedChrome.json"; @@ -1282,7 +1282,7 @@ pub fn observe_browser_script_with_registry( let mut run = match lookup { BrowserScriptRunLookup::Run(run) => run, BrowserScriptRunLookup::Cached(output) => return Ok(output), - BrowserScriptRunLookup::Unknown => bail!("unknown browser_script run_id {run_id:?}"), + BrowserScriptRunLookup::Unknown => return Ok(unknown_browser_script_run_output(run_id)), }; if run.session_id != session_id { let owner = run.session_id.clone(); @@ -1359,6 +1359,38 @@ pub fn observe_browser_script_with_registry( } } +fn unknown_browser_script_run_output(run_id: &str) -> BrowserScriptOutput { + BrowserScriptOutput { + ok: true, + status: Some("not_found".to_string()), + run_id: Some(run_id.to_string()), + text: format!( + "No active or recently completed browser_script run was found for run_id: {run_id}. The id may be stale, already consumed, or from an older turn. Do not retry the same observe blindly; inspect the current page state or start a new browser_script if more data is needed." + ), + data: json!({ + "status": "not_found", + "run_id": run_id, + }), + ..Default::default() + } +} + +fn unknown_browser_script_cancel_output(run_id: &str) -> BrowserScriptOutput { + BrowserScriptOutput { + ok: true, + status: Some("not_found".to_string()), + run_id: Some(run_id.to_string()), + text: format!( + "No active browser_script run was found to cancel for run_id: {run_id}. The run may have already finished, timed out, or been cancelled. Continue by inspecting the current page state or starting a new browser_script if more data is needed." + ), + data: json!({ + "status": "not_found", + "run_id": run_id, + }), + ..Default::default() + } +} + pub fn cancel_browser_script(session_id: &str, run_id: &str) -> Result { secrets_runtime::finish_with_redaction( session_id, @@ -1371,11 +1403,13 @@ pub fn cancel_browser_script_with_registry( run_id: &str, registry: &BrowserScriptRunRegistry, ) -> Result { - let mut run = registry + let Some(mut run) = registry .lock() .expect("browser_script run registry poisoned") .remove(run_id) - .ok_or_else(|| anyhow!("unknown browser_script run_id {run_id:?}"))?; + else { + return Ok(unknown_browser_script_cancel_output(run_id)); + }; if run.session_id != session_id { let owner = run.session_id.clone(); registry @@ -13230,14 +13264,12 @@ print("bridge retry ok") "private browser_script runs must not be inserted into the legacy global registry" ); let run_id = started.run_id.as_deref().unwrap(); - let global_err = observe_browser_script(session_id, run_id, 50) - .expect_err("global registry must not see private run"); - assert!( - global_err - .to_string() - .contains("unknown browser_script run_id"), - "unexpected global observe error: {global_err}" - ); + let global_output = + observe_browser_script(session_id, run_id, 50).expect("unknown observe is recoverable"); + assert_eq!(global_output.status.as_deref(), Some("not_found")); + assert!(global_output + .text + .contains("No active or recently completed browser_script run")); let mut finished = observe_browser_script_with_registry(session_id, run_id, 2_500, ®istry).unwrap(); @@ -13251,6 +13283,34 @@ print("bridge retry ok") assert_eq!(registry.active_run_count_for_session(session_id), 0); } + #[test] + fn browser_script_observe_unknown_run_id_is_recoverable() { + let registry = BrowserScriptRunRegistry::new(); + let output = + observe_browser_script_with_registry("script-unknown-run", "bs-missing", 50, ®istry) + .unwrap(); + + assert!(output.ok); + assert_eq!(output.status.as_deref(), Some("not_found")); + assert_eq!(output.run_id.as_deref(), Some("bs-missing")); + assert!(output.text.contains("inspect the current page state")); + } + + #[test] + fn browser_script_cancel_unknown_run_id_is_recoverable() { + let registry = BrowserScriptRunRegistry::new(); + let output = + cancel_browser_script_with_registry("script-unknown-cancel", "bs-missing", ®istry) + .unwrap(); + + assert!(output.ok); + assert_eq!(output.status.as_deref(), Some("not_found")); + assert_eq!(output.run_id.as_deref(), Some("bs-missing")); + assert!(output + .text + .contains("No active browser_script run was found to cancel")); + } + #[test] fn browser_script_observe_can_return_no_new_output() { let temp = tempfile::tempdir().unwrap(); diff --git a/crates/browser-use-secrets/src/lib.rs b/crates/browser-use-secrets/src/lib.rs index 0718eb73..cf992cc6 100644 --- a/crates/browser-use-secrets/src/lib.rs +++ b/crates/browser-use-secrets/src/lib.rs @@ -497,7 +497,10 @@ mod tests { // Inputs without `/` or `\` are emitted unchanged, so keys already // persisted on disk stay valid (backward compatibility). assert_eq!(account_for("github.com", "password"), "github.com/password"); - assert_eq!(account_for("\u{1}agentmail", "token"), "\u{1}agentmail/token"); + assert_eq!( + account_for("\u{1}agentmail", "token"), + "\u{1}agentmail/token" + ); // And the store actually keeps such pairs separate end-to-end. let store = InMemorySecretStore::new(); diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index 3458e691..7018a76d 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -4,7 +4,7 @@ Raw CDP is the center of page interaction. Treat `cdp("Domain.method", ...)` ins The `browser` tool behaves like a CLI for browser runtime management. Use it for `browser status --json`, `browser connect local`, `browser local setup`, `browser connect managed`, `browser remote start`, `browser doctor`, explicit recovery, profile summaries, runtime logs, and ownership checks. It does not interact with pages. -The `browser_script` tool runs fresh Python in a browser-connected environment. Browser/CDP state persists in Rust; Python variables do not persist across calls. Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `http_get_many`, `browser_fetch`, `browser_fetch_many`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. +The `browser_script` tool runs fresh Python in a browser-connected environment. Browser/CDP state persists in Rust; Python variables do not persist across calls. Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `browser_fetch`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. `browser_script` has a start/listen lifecycle. A fast call returns final output immediately. A longer call returns `status: running` plus `run_id`; observe it with `action="observe"` until final status. If observe returns no new output for its wait window, back off instead of polling constantly. Images/artifacts emitted by the running script are returned by observe as soon as they exist. Use `action="cancel"` with the `run_id` only when the running script is no longer useful. @@ -29,12 +29,12 @@ Browser-harness workflow: - First navigation should usually be `new_tab(url)`, not `goto_url(url)`, because `goto_url` mutates the active tab. `new_tab(url)` and `goto_url(url)` have zero implicit wait: they send the CDP navigation command and then return without waiting for readyState, network idle, selectors, paint, or sleeps. If you chain more work in the same script after navigation, explicitly wait or poll before reading/clicking. If navigation is the last action before yielding to the model, the LLM call itself may provide enough elapsed time; the next call must still inspect state before assuming the page loaded. - When a task is site-specific and a matching domain skill exists, read it before inventing selectors, private API routes, or flows. Use `domain_skills_for_url(url, include_content=True)` before or immediately after navigation; `goto_url(url)` also records matching skill metadata in the tool result. -- Use screenshots as labeled temporal checkpoints when visual state matters. For text-heavy research, document reading, search, pricing, tables, and list extraction, default to `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. Capture visual state before and after meaningful browser actions only when layout, coordinates, blockers, menus, dialogs, downloads, uploads, form submissions, or final visual verification matter. +- Use screenshots as labeled temporal checkpoints. Screenshots are often the fastest way to understand the page, spot blockers, read visible state, and verify what changed. Capture visual state before and after meaningful browser actions: initial load, clicks, scrolls, route changes, menus, dialogs, downloads, uploads, form submissions, and final verification. For screenshot or visual-output tasks, verify the artifact is contentful and nonblank before `done`. - Prefer coordinate clicks for visible targets. Use `screenshot` or `capture_screenshot`, inspect the pixels, `click_at_xy(x, y)`, then screenshot again to verify. Chrome hit-testing handles iframes, shadow DOM, and cross-origin content better than selector abstractions. - For forms, behave like a browser user. Inspect visually with screenshots first; use read-only JS only when pixels are insufficient to identify labels or stable selectors. Click into visible text fields before typing. Use `type_text(...)`, `press_key(...)`, or `fill_input(...)` for text, and real coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Never bulk-fill a live form by setting DOM values, setting checked state, dispatching synthetic form events, or running a restore loop; this can desynchronize framework state from the visible DOM. - Prefer capturing the action timeline inside one `browser_script` tool call when possible: `screenshot("before_click")`, perform the action, wait for the state change, then `screenshot("after_click")`. - Do not call `screenshot` repeatedly on an unchanged viewport. Once you have a screenshot, either take an action, inspect with CDP/JS, navigate, scroll, call `screenshot_clip(...)` for a different region, wait for an async transition, or finish. Every screenshot should have a purpose: observe current state, verify an action, inspect a changed region, or preserve final evidence. -- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. If you have three or more independent URLs, files, documents, or API endpoints to inspect, batch them in one `browser_script` call with `http_get_many` or `browser_fetch_many` instead of visiting them one at a time. +- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. Use scripts to make source-completion faster, but keep each extraction chunk small enough to inspect progress, counts, missing fields, and source coverage before continuing. - `js(...)` returns Python values. After `text = js("document.body.innerText")`, use Python slicing like `text[:1000]`; only use JavaScript methods such as `.slice(...)` inside the JavaScript expression itself. - After actions that trigger loads, SPA transitions, XHR/fetch, menus, dialogs, downloads, uploads, or other visible state changes, be patient by making several cheap observations, not one long blind wait. Prefer short waits, then inspect again with `page_info()` or a screenshot. A wait returning false is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. - If redirected to an auth wall or credential prompt, stop and ask the user. Do not infer or type credentials from screenshots. @@ -51,10 +51,14 @@ Python namespace rule: `browser_script` variables do not persist across calls. S Durable helper rule: if you discover a reusable selector, site quirk, private API, or interaction helper, put the smallest useful helper in `.browser-use/agent-workspace/agent_helpers.py` and use it on later calls. The file is auto-loaded when it changes; call `load_agent_helpers()` if you need to force reload. Keep helpers task-focused, CDP-friendly, and free of secrets. Do not build manager layers, retry frameworks, page-object frameworks, or wrapper abstractions unless the task itself absolutely requires it. -Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist of every required row and field. Spend work across the checklist, not indefinitely on one difficult item. For each item/source, use a small number of targeted attempts, then either record the best verified value, mark it unavailable/unknown with the source and reason, or move to a better source. Do not keep varying one search term while other required rows are untouched. Before `done`, audit the checklist: every requested row/field must be filled, explicitly unavailable/unknown, or clearly reported as partial with the remaining gaps. +Never fabricate or pattern-guess values (emails, IDs, prices, names) to fill required fields or to satisfy an audit; every value must come from a source you actually observed. An honest null or "not found" with the checked source noted is an acceptable final value; a fabricated value is never acceptable and is worse than an empty field. -Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief: verify the domain has the right category, currency, locale, or authority, then commit. If the task permits unavailable/missing rows, a domain is viable as soon as it has the requested category/source type and at least one requested row or a searchable catalog in the requested currency/locale; do not keep searching for a perfect domain that has every row. Do not stitch rows from multiple domains, and do not keep vendor-hopping after a viable domain exists. Switch domains only when the current domain clearly cannot satisfy the requested category/currency/authority after a bounded check. If an item is missing on the committed domain, mark it unavailable for that domain and move to the next checklist row. +Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist of every required row and field. Required rows and fields must be completed from the correct source type unless absence is proven from that source. Do not mark required values unknown just because one query, endpoint, page, or selector failed. Spend work across the checklist, but before `done`, audit row count, required fields, null rates, dedupe keys, and source coverage; missing required values must either be fixed or reported as an explicitly incomplete fallback. -Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `http_get_many` for independent public URLs or `browser_fetch_many` when browser cookies/session state are needed. Use single `http_get`/`browser_fetch` calls for one-off checks. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. +Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief, but completion on the chosen domain is not optional. Do not stitch rows from multiple domains, do not silently source-switch, and do not mark missing rows unavailable unless the correct source path was checked and the absence is supported by evidence. Switch domains only when the current domain clearly cannot satisfy the requested category, currency, locale, or authority. + +Bounded audit rule: verification should catch real mistakes without spinning forever. After writing a substantial artifact, run a small independent audit of count/schema/required-fields/source coverage, then repair the specific gaps it finds. Keep repairing until the required rows and fields are satisfied or the run-level task timebox is nearly spent — do not stop after a single repair pass while required data is still missing and reachable from its correct source. What you must avoid is blindly restarting the same full crawl, pagination sweep, or detail scrape just because counts fluctuate or a site intermittently returns empty pages; target the specific missing items instead of starting over. Finalize as explicitly incomplete only when the source genuinely does not expose the required data, or the run is nearly out of turns. + +Use the browser to discover and verify. Once the browser reveals a stable data endpoint, static link, downloadable asset, XHR/fetch pattern, or predictable pagination URL, use single targeted `http_get` or `browser_fetch` calls when they help verify or complete the source. Do not replace source completion with blind bulk fetching. If a script, `http_get`, `browser_fetch`, endpoint, or selector errors, returns empty, or is blocked, fall back to navigating and reading the rendered page before marking anything unavailable — do not re-run the same failing script or abandon the source for a degraded one. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Each chunk must emit compact progress with source URL/pattern, count, missing-field count, failure count, and next cursor/page when relevant. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. Before `done`, block claimed-complete results with empty arrays, high null rates in required fields, missing required fields, wrong known counts, source drift, blank screenshots, or partial/incomplete markers. Use helper agents only when the user explicitly asks for sub-agents, delegation, or parallel agent work. Requests for depth, thoroughness, research, investigation, or detailed codebase analysis do not by themselves authorize spawning a helper. When delegation is authorized, give each helper a narrow, self-contained task that materially advances the work, keep urgent blocking work local, avoid duplicate helper work, and continue useful non-overlapping local work while the helper runs. Use the `explorer` role for authorized read-only repository questions and `worker` for authorized implementation work with a bounded write scope. diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index 646b9df6..c4a354e9 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -56,9 +56,7 @@ email_address() email_inbox(limit=20, sent_after=None) email_message(message_id) http_get(url, **kwargs) -http_get_many(urls, **kwargs) browser_fetch(url, **kwargs) -browser_fetch_many(requests, **kwargs) copy_artifact(path, kind="file") emit_output(value, label=None) @@ -82,7 +80,7 @@ Usage guidance: - Do not combine `Input.dispatchKeyEvent` carrying printable `text` with a manual `char` event for the same character; that double-inserts text in Chrome. - If the task is site-specific, call `domain_skills_for_url(url, include_content=True)` before inventing selectors, private API routes, or flows. `goto_url(url)` also returns matching `domain_skills` metadata when a skill root is available. - Be patient with loading pages by making several cheap observations, not one long blind wait. Prefer short waits such as `wait_for_load(1)`, `wait_for_element(selector, timeout=2)`, or `wait_for_network_idle(2)`, then inspect again. If a wait returns false, that is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. -- Use screenshots as labeled temporal checkpoints when visual state matters: before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and visual final verification. For text-heavy research, document reading, search, pricing, tables, and list extraction, prefer `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. +- Use screenshots as labeled temporal checkpoints: initial load, before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and final verification. For screenshot or visual-output tasks, verify the saved image is contentful and nonblank before `done`. - The common screenshot call is `screenshot(label)`, for example `screenshot("before_submit")`. - Screenshot/image artifacts are sent as `input_image` content to the next model turn. The user does not see those pixels inline in the terminal; describe what you see or provide the saved artifact path when the user asks for the screenshot. - If a script emits screenshots/images and then fails, the next model turn still receives the images alongside the failure diagnosis. Use those pixels to decide the next smaller retry. @@ -118,45 +116,16 @@ emit_output(rows, label="employee_rows") - Use `js(...)` for DOM inspection and raw `cdp(...)` for lower-level browser actions. - Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. - For real user forms, act like a browser user: screenshot, click the visible field/control, type with `type_text(...)`, `press_key(...)`, or `fill_input(...)`, then screenshot or otherwise verify. Use coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Do not assign `element.value`, `element.checked`, `selectedIndex`, React private state, or MutationObserver restore loops on live forms. Do not synthesize `input`, `change`, `click`, or keyboard events in page JavaScript to make a form look filled. Those anti-patterns can desynchronize framework state from the visible DOM. -- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint, and `http_get_many(...)` for several independent public URLs. Use `browser_fetch(...)` or `browser_fetch_many(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. `browser_fetch(...)` and the batch helpers return error records by default so one bad endpoint does not waste the whole extraction chunk; pass `return_error=False` or `return_errors=False` only when a hard failure is intended. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. -- Batch recipe after discovering stable links or endpoints: - -```python -# browser_summary: -# { -# "fetch_progress": { -# "kind": "extracted", -# "message": "Fetched ${$.ok_count}/${$.total} independent URLs" -# }, -# "records": { -# "kind": "extracted", -# "message": "Extracted ${$.length} records from fetched pages" -# } -# } - -urls = [...] -responses = http_get_many(urls, timeout=12, max_workers=8) -ok = [r for r in responses if not isinstance(r, dict) and getattr(r, "status_code", 0) < 400] -emit_output({"total": len(responses), "ok_count": len(ok)}, label="fetch_progress") - -records = [] -for url, response in zip(urls, responses): - if isinstance(response, dict) and response.get("error"): - records.append({"url": url, "status": "error", "error": response["error"]}) - continue - text = response.text - records.append({"url": url, "status": response.status_code, "title": text[:200]}) - -emit_output(records, label="records") -``` +- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint. Use `browser_fetch(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. Do not replace source completion with blind bulk fetching; use small inspected chunks with progress, counts, missing fields, and source coverage. - Extract only fields needed for the task. Do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless you are debugging and the smaller field-level extraction failed. - Save complete generated result files under `outputs_dir()` or relative paths in the current working directory. Files written there are collected as artifacts automatically; `copy_artifact(...)` is for files created elsewhere. - For large structured results, write the full JSON/CSV/text to a file. If the task asks for an exact inline final format, return that content with `done(result=...)` and optionally include `result_file=path`; otherwise finish with `done(result_file=path)`. - For loops over multiple pages/items, emit short progress every item or every 2 seconds, whichever comes first. Progress can be a short `print(...)` line or compact `emit_output(..., label="progress")`. +- For audits after a large result, run a small independent sample/count/schema check, then repair the specific gaps it finds until the required rows/fields are complete or the run is nearly out of turns. Do not rerun the whole crawl or full detail scrape just because counts fluctuate or some pages are intermittently empty; target the missing items, and mark a gap as a genuine absence only after checking its correct source path. - For list/profile extraction, filter the candidate list before navigating when the list page already contains enough information, such as employee versus contractor. Do not visit rows that cannot affect the final answer. -- Poll for record readiness, not for nullable answer fields. If the app cache or DOM record for a person exists but `birthday`, phone, address, or another optional field is missing/null, record that value as missing and continue instead of waiting for the optional field to appear. -- For long extraction or verification loops, prefer bounded chunks with checkpoints written to files. Use one global deadline plus per-item micro timeouts; check the global deadline before every navigation, wait, and sleep. If a chunk fails with a usable-page diagnosis, shrink the next chunk and resume from the last checkpoint. +- Poll until the record itself is ready before extracting fields. If a loaded record is missing a required field, inspect the correct source path before marking it absent; do not record required values as missing just because the first record view is null. +- For long extraction or verification loops, prefer bounded chunks with checkpoints written to files. Use per-item micro timeouts and inspect progress after each chunk. If a chunk fails with a usable-page diagnosis, shrink the next chunk and resume from the last checkpoint. Signing in / sign-ups: before signing up with a new email, check whether you're already logged in (you often drive the user's own profile) or have a saved credential for the site (listed under "Saved credentials") — if so, use it. If there's no existing login, ask the user whether to sign in with their own account (they save it via `/secrets`) or have you create a disposable account (you generate a throwaway inbox with `email_address()` and read its verification emails yourself), and wait for their choice. For the disposable path, call `email_address()`, record whatever context you need before submitting (`current_datetime()["utc"]`, existing `message_id`s from `email_inbox()`, or both), fill the email field, submit, then inspect/poll `email_inbox(sent_after=...)` or compare `timestamp`/`message_id` yourself (newest-first; `preview` already holds the code; `email_message(message_id)` has the full `text`/`html` for magic links). diff --git a/prompts/dataset-case-user.md b/prompts/dataset-case-user.md index d668d259..1efc0d61 100644 --- a/prompts/dataset-case-user.md +++ b/prompts/dataset-case-user.md @@ -6,7 +6,7 @@ Task ID: {{task_id}} Task: {{task}} -Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. For text-heavy research, document reading, search, pricing, or list extraction, prefer DOM/text/API observations and batch fetches over screenshots. Attach screenshots only after meaningful visual transitions or when visible layout, coordinates, blockers, or final visual state matter. +Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. Use screenshots as verification checkpoints after meaningful visual transitions and whenever visible state, layout, blockers, coordinates, dynamic content, or final browser state matters. Filesystem contract: if the task asks you to save files, write them in the current working directory using relative paths. For large JSON/CSV/list results, save the full result to `result.json` or `result.csv` so it is available as an artifact. If the requested final answer is not an exact inline format, return a compact final answer with the output path, record count, schema/columns, and one sample row instead of pasting a giant blob. @@ -16,9 +16,11 @@ File result contract: after writing a complete JSON/CSV/text result file, use `d Remote browser contract: browser automation may run on a different machine from the local filesystem. Files downloaded by the remote browser are not automatically available in the local working directory. If a task needs a downloaded file locally, transfer or fetch it into the current working directory, then verify the local path exists before referencing, opening, or finalizing it. For uploads, make sure the file you intend to upload is available to the browser context you are controlling. -Long extraction contract: if the task needs many pages, rows, files, or detail records, work in bounded chunks. Discover the endpoint or pagination pattern first, then fetch in batches with explicit timeouts, checkpoint partial results in the current working directory, and print compact progress counts. A timed-out all-in-one crawl with no saved artifact is not progress; resume from checkpoints when a chunk fails. +Long extraction contract: if the task needs many pages, rows, files, or detail records, work in bounded chunks. Discover the source and pagination pattern first, inspect progress from the browser/source itself, checkpoint partial results in the current working directory, and print compact progress counts. A timed-out all-in-one crawl with no saved artifact is not progress; resume from checkpoints when a chunk fails. -Timebox contract: dataset runs have a short wall-clock budget. For long research, document, or extraction tasks, set a soft deadline before starting broad collection, about 7 minutes from now, and a hard deadline about 8.5 minutes from now. Check the deadline before each new page, document, query, or file. After the soft deadline, stop broad research and fill remaining fields from the strongest verified evidence or mark them unknown/unavailable. Before the hard deadline, call `done(...)` with the completed or partial result. Never keep running until the external runner timeout with no saved result. +Bounded verification contract: after a large result is saved, verify it with a compact independent check of count/schema/required-fields/source coverage, then repair the specific gaps it finds. Keep repairing until the required rows and fields are satisfied or the step budget is nearly spent; target the specific missing items rather than blindly restarting full crawls or pagination sweeps just because counts fluctuate. Finalize from the best saved artifact only when the source genuinely cannot supply the missing data. + +Completion discipline: complete the requested task before calling `done(...)`. Do not call `done` with a partial result just because the source is awkward, slow, or a first extraction path failed. Only when the remaining step budget is nearly exhausted and a full answer is no longer possible, save the best verified artifact, explicitly mark the result incomplete, list the missing requirements, and then call `done(...)`. Completion contract: the final answer must contain the requested answer or a clear pointer to the artifact that contains it. For artifact-heavy results, include the artifact path, record count, schema/columns, and one sample row. A bare acknowledgement such as `Done.` is not useful unless the task explicitly asked for no visible answer. @@ -28,6 +30,6 @@ Verification contract: when the task has explicit checkable requirements for rec If the task gives fallback instructions, treat them as part of the task. Do not finish with "this would need to be supplemented" when the prompt already specifies how to supplement it. -When the turn budget is nearly exhausted, stop starting new lines of investigation. Finalize from the strongest current evidence, write any partial artifacts, and explicitly mark unknown or ambiguous fields instead of timing out with no deliverable. +When the turn budget is nearly exhausted, stop starting new lines of investigation. Finalize from the strongest current evidence only as an explicitly incomplete fallback, write any partial artifacts, and name unknown or ambiguous fields instead of timing out with no deliverable. Return the final answer with the done tool only when the task is complete. diff --git a/prompts/python-tool-description.md b/prompts/python-tool-description.md index 84314cd1..14991a3f 100644 --- a/prompts/python-tool-description.md +++ b/prompts/python-tool-description.md @@ -12,7 +12,7 @@ Do not import Playwright, Selenium, or Pyppeteer. Browser-harness workflow: firs To pass pixels directly to the next model turn, call raw `cdp("Page.captureScreenshot", format="png")`, `screenshot(label)`, `screenshot_clip(label, x, y, width, height)`, or `capture_screenshot(..., attach=True)`; use `emit_image(path)` for existing image files. Raw `Page.captureScreenshot` results are attached automatically. The user does not see attached pixels inline in the terminal; describe what you see or provide the saved artifact path when the user asks for a screenshot. Multiple labeled screenshots are good when they form a temporal trace, not when they repeat the same unchanged page. -For final structured answers, write the complete JSON/CSV/text to a file under `outputs_dir()` or a relative path in the current working directory, then verify the count/schema. If the task asks for exact inline JSON, CSV, a table, markdown, or a schema-shaped response, return that content with `done(result=...)`; you may also include `result_file=path` to attach the saved file. Use `done(result_file=path)` by itself only when a file pointer or artifact summary satisfies the task. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial data to files, emit progress every item or every 2 seconds, use one global deadline, filter candidates before navigation when possible, and poll for record readiness rather than nullable answer fields. Extract only task-relevant fields; avoid full profile text, full DOM text, cookies, localStorage, or whole app caches unless debugging requires them. Use the browser to discover and verify; after stable endpoints, file URLs, XHR/fetch patterns, or pagination URLs are known, use `requests`, `http_get`, or threaded HTTP for bulk extraction. +For final structured answers, write the complete JSON/CSV/text to a file under `outputs_dir()` or a relative path in the current working directory, then verify the count/schema. If the task asks for exact inline JSON, CSV, a table, markdown, or a schema-shaped response, return that content with `done(result=...)`; you may also include `result_file=path` to attach the saved file. Use `done(result_file=path)` by itself only when a file pointer or artifact summary satisfies the task. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial data to files, and emit progress every item or every 2 seconds with source URL/pattern, count, missing-field count, failure count, and next cursor/page when relevant. After saving a large artifact, verify with a compact independent count/schema/required-fields/source check, then repair the specific gaps it finds until the required data is complete or the run is nearly out of turns; target the missing items rather than restarting the full crawl or detail scrape when the source is inconsistent or intermittently empty. Extract only task-relevant fields; avoid full profile text, full DOM text, cookies, localStorage, or whole app caches unless debugging requires them. Use the browser to discover and verify stable endpoints, file URLs, XHR/fetch patterns, or pagination URLs, then use targeted `requests` or `http_get` calls to complete or verify the source. Do not replace source completion with blind bulk fetching. Use `audit_artifact(...)` or ordinary Python checks when the task has explicit checkable requirements for the result or files. If checks fail, fix the result and rerun them when possible; otherwise finalize as partial/incomplete and name the remaining gaps.