diff --git a/Cargo.lock b/Cargo.lock index 94fecebd..682c7126 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -267,6 +267,7 @@ dependencies = [ "browser-use-secrets", "browser-use-store", "futures-util", + "libc", "portable-pty", "rand 0.9.4", "regex", diff --git a/Cargo.toml b/Cargo.toml index c99553b3..bb0c1e4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ ignore = "0.4" rand = "0.9" iana-time-zone = "0.1" image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "webp"] } +libc = "0.2" ratatui = { version = "0.30", default-features = false, features = ["crossterm_0_29"] } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] } regex = "1" diff --git a/crates/browser-use-agent/Cargo.toml b/crates/browser-use-agent/Cargo.toml index 59705092..8882279e 100644 --- a/crates/browser-use-agent/Cargo.toml +++ b/crates/browser-use-agent/Cargo.toml @@ -26,6 +26,7 @@ rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["stream"] } rustls.workspace = true +libc.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true diff --git a/crates/browser-use-agent/src/compact/mod.rs b/crates/browser-use-agent/src/compact/mod.rs index 164ed355..1afa6374 100644 --- a/crates/browser-use-agent/src/compact/mod.rs +++ b/crates/browser-use-agent/src/compact/mod.rs @@ -302,7 +302,18 @@ pub fn compacted_history_from_summary( summary_suffix: CompactionSummary, token_limit: usize, ) -> CompactedHistory { - let summary_text = format!("{SUMMARY_PREFIX}\n{}", summary_suffix.text); + // The empty-summary fallback must apply to the SUFFIX, before the prefix + // is prepended — otherwise the prefix makes the combined string non-empty + // and the fallback in build_compacted_history never fires. A summarize() + // pass that returns no text then ships "...Here is the summary...:" followed + // by NOTHING, leaving the post-compaction model with total amnesia (real_v8 + // task 24: 30 thrash turns, task lost). + let suffix_text = if summary_suffix.text.trim().is_empty() { + "(no summary available)".to_string() + } else { + summary_suffix.text.clone() + }; + let summary_text = format!("{SUMMARY_PREFIX}\n{suffix_text}"); let user_messages: Vec = history.iter().filter_map(real_user_message_text).collect(); let items = build_compacted_history(&user_messages, &summary_text, token_limit); diff --git a/crates/browser-use-agent/src/compact/tests.rs b/crates/browser-use-agent/src/compact/tests.rs index 8905ec56..355b4e48 100644 --- a/crates/browser-use-agent/src/compact/tests.rs +++ b/crates/browser-use-agent/src/compact/tests.rs @@ -370,11 +370,11 @@ async fn context_window_exceeded_with_only_prompt_left_propagates() { } #[tokio::test] -async fn empty_model_summary_yields_prefix_only_summary() { - // The model returns an empty summary => summary_text = PREFIX + "\n" + "". - // (The "(no summary available)" placeholder only applies when the WHOLE - // summary_text is empty, which it never is here because the prefix is present; - // codex compact.rs:516.) +async fn empty_model_summary_yields_no_summary_available_fallback() { + // The model returns an empty summary => the SUFFIX falls back to + // "(no summary available)" BEFORE the prefix is prepended (codex + // compact.rs:516). Shipping PREFIX+"" gave the post-compaction model + // total amnesia (real_v8 task 24). let history = vec![user_item("a user message")]; let sampler = ScriptedSampler::new(vec![Ok(String::new())]); @@ -390,7 +390,10 @@ async fn empty_model_summary_yields_prefix_only_summary() { let last = item_text(compacted.items.last().unwrap()); assert_eq!(last, compacted.summary_text); - assert_eq!(compacted.summary_text, format!("{SUMMARY_PREFIX}\n")); + assert_eq!( + compacted.summary_text, + format!("{SUMMARY_PREFIX}\n(no summary available)") + ); } // ---- (5) end-to-end through the real TurnLoop ----------------------------- diff --git a/crates/browser-use-agent/src/entrypoint/mod.rs b/crates/browser-use-agent/src/entrypoint/mod.rs index 4c743a69..5c1765a1 100644 --- a/crates/browser-use-agent/src/entrypoint/mod.rs +++ b/crates/browser-use-agent/src/entrypoint/mod.rs @@ -49,6 +49,7 @@ pub mod provider; +use std::any::Any; use std::future::Future; use std::pin::Pin; use std::sync::Arc; @@ -83,7 +84,7 @@ use crate::context::{ ContextManager, Item, }; use crate::decision::{AutoCompactTokenLimitScope, SamplingOutcome, TokenStatus}; -use crate::events::{names, session_done_payload, EventSink, PendingEvent, TurnCtx}; +use crate::events::{names, session_done_payload, EventSink, PendingEvent, ResultFilePtr, TurnCtx}; use crate::live_executor::ensure_agent_attached as ensure_runtime_agent_attached; use crate::session::reconstruct::WORKSPACE_CONTEXT_MULTI_AGENT_USAGE_HINT_KIND; use crate::session::SessionId; @@ -1393,6 +1394,101 @@ fn runtime_or_store_events( .unwrap_or_default() } +/// Fallback final result discovered from the session cwd when the model ends +/// without a `done()` call (ported from exp/real-v8-restore-88 2bd479d). +struct FallbackResultFile { + text: String, + file: ResultFilePtr, +} + +fn fallback_result_file_for_session( + store: &SharedStore, + session_id: &str, +) -> Option { + let session = store + .lock() + .expect("store mutex poisoned") + .load_session(session_id) + .ok() + .flatten()?; + let cwd = std::path::PathBuf::from(session.cwd); + let files = discover_result_files(&cwd); + let first = files.first()?; + let text = if files.len() == 1 { + format!("Result file: {}", first.path.display()) + } else { + let rendered = files + .iter() + .map(|file| format!("- {}", file.path.display())) + .collect::>() + .join("\n"); + format!("Result files:\n{rendered}") + }; + Some(FallbackResultFile { + text, + file: ResultFilePtr { + url: None, + path: Some(first.path.display().to_string()), + bytes: Some(first.bytes), + }, + }) +} + +struct DiscoveredResultFile { + path: std::path::PathBuf, + bytes: u64, +} + +fn discover_result_files(cwd: &std::path::Path) -> Vec { + let Ok(entries) = std::fs::read_dir(cwd) else { + return Vec::new(); + }; + let mut files = entries + .filter_map(Result::ok) + .filter_map(|entry| { + let path = entry.path(); + let name = path.file_name()?.to_str()?; + if !name.starts_with("result.") { + return None; + } + let metadata = entry.metadata().ok()?; + if !metadata.is_file() || metadata.len() == 0 { + return None; + } + Some(DiscoveredResultFile { + path, + bytes: metadata.len(), + }) + }) + .collect::>(); + files.sort_by(|left, right| { + let left_name = left + .path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or(""); + let right_name = right + .path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or(""); + result_file_priority(left_name) + .cmp(&result_file_priority(right_name)) + .then_with(|| left_name.cmp(right_name)) + }); + files +} + +fn result_file_priority(name: &str) -> usize { + match name { + "result.json" => 0, + "result.csv" => 1, + "result.md" => 2, + "result.txt" => 3, + _ => 10, + } +} + fn ensure_fallback_capture_recording(store: &SharedStore, session_id: &str) { if !fallback_capture_recording_enabled() { return; @@ -2401,40 +2497,15 @@ fn media_content_part_for_provider( } } -/// A [`TurnObserver`] that maps loop lifecycle into the durable UI event log. +/// A [`TurnObserver`] for lifecycle hooks that must not decide terminal status. /// -/// On turn completion it emits the final agent message as a `session.done` -/// event through the durable UI sink, so the run's result is visible to the TUI -/// and protocol reducers. The streaming text deltas are emitted by the sampling -/// driver through the same durable sink. -struct StoreObserver { - sink: Arc, - session_id: String, -} - -impl StoreObserver { - fn new(sink: Arc, session_id: String) -> Self { - Self { sink, session_id } - } -} +/// Runtime-owned runs accept `session.done` only after the turn loop has returned +/// and runtime resources have been quiesced. Streaming model/tool events are +/// still persisted by the sampling driver through [`RuntimeStoreSink`]. +struct StoreObserver; impl TurnObserver for StoreObserver { - fn on_lifecycle(&self, ev: TurnLifecycleEvent) { - // Phase-E seam: started/aborted lifecycle markers are not surfaced as - // store events yet (the legacy stack had richer turn-lifecycle telemetry). - // We persist the terminal session result, which is what readers need today. - if let TurnLifecycleEvent::TurnComplete { - last_agent_message: Some(text), - .. - } = ev - { - self.sink.emit(PendingEvent::new( - self.session_id.clone(), - names::SESSION_DONE, - session_done_payload(Some(&text), None), - )); - } - } + fn on_lifecycle(&self, _ev: TurnLifecycleEvent) {} } /// A network-free scripted driver for the `Fake` backend. @@ -2588,15 +2659,7 @@ impl RuntimeTurnLoopDriver { } *state.pre_turn_replay_from_seq.lock().unwrap() = None; - // The observer persists the terminal agent message through the runtime so - // `session.done` is journaled and projected by the same live authority as - // model/tool events. - let sink: Arc = Arc::new(RuntimeStoreSink { - runtime: runtime_handle, - store: Arc::clone(&store), - model_context_window: None, - }); - let observer = StoreObserver::new(sink, session_id.as_str().to_string()); + let observer = StoreObserver; let turn_loop = TurnLoop::new(state, driver, observer); let result = match max_turns { @@ -2611,10 +2674,83 @@ impl RuntimeTurnLoopDriver { .await } }; - if result.is_ok() { - ensure_fallback_capture_recording(&store, session_id.as_str()); + let runtime_session_id = RuntimeSessionId::from_string(session_id.as_str().to_string())?; + match result { + Ok(last_agent_message) => { + ensure_fallback_capture_recording(&store, session_id.as_str()); + cleanup_session_resources_for_terminal( + runtime_handle.clone(), + runtime_session_id.clone(), + ) + .await?; + // Never lose finished work: if the model ended without a final + // message (no done() call — e.g. ended on leaked planning text + // or an empty turn), fall back to the best result.* artifact in + // the session cwd so session.done still carries the deliverable. + let mut final_message = last_agent_message; + let mut result_file: Option = None; + if final_message.is_none() && !cancel.is_cancelled() { + if let Some(fallback) = + fallback_result_file_for_session(&store, session_id.as_str()) + { + final_message = Some(fallback.text); + result_file = Some(fallback.file); + } + } + if let Some(text) = final_message.as_deref() { + runtime_handle.append_observed_session_event( + runtime_session_id, + names::SESSION_DONE, + session_done_payload(Some(text), result_file.as_ref()), + RuntimeDurability::Barrier, + )?; + } + Ok(final_message) + } + Err(error) => { + let _ = cleanup_session_resources_for_terminal( + runtime_handle.clone(), + runtime_session_id, + ) + .await; + Err(error) + } } - result + } +} + +async fn cleanup_session_resources_for_terminal( + runtime_handle: RuntimeHandle, + runtime_session_id: RuntimeSessionId, +) -> Result { + let session_for_error = runtime_session_id.as_str().to_string(); + tokio::task::spawn_blocking(move || { + std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + runtime_handle.cleanup_session_resources(&runtime_session_id) + })) + }) + .await + .map_err(|error| { + AgentError::Other(anyhow::anyhow!( + "terminal cleanup task failed to join for session {session_for_error}: {error}" + )) + })? + .map_err(|panic| { + AgentError::Other(anyhow::anyhow!( + "terminal cleanup panicked for session {session_for_error}: {}", + panic_payload_message(panic) + )) + })? + .map_err(AgentError::Other) +} + +fn panic_payload_message(payload: Box) -> String { + if let Some(message) = payload.downcast_ref::<&str>() { + (*message).to_string() + } else if let Some(message) = payload.downcast_ref::() { + message.clone() + } else { + "unknown panic payload".to_string() } } @@ -4402,6 +4538,105 @@ mod tests { ); } + #[tokio::test] + async fn runtime_backed_done_is_journaled_after_resource_cleanup() { + let (dir, store, session_id) = store_with_session(); + seed_user_input(&store, &session_id, "initial").await; + let journal = Arc::new(SqliteJournal::from_store( + Store::open(dir.path()).expect("open runtime store"), + )); + let persistence: Arc = journal.clone(); + let state_index: Arc = journal; + let runtime = BrowserUseRuntime::new(persistence, state_index).handle(); + let runtime_session_id = + RuntimeSessionId::from_string(session_id.clone()).expect("runtime session id"); + runtime + .attach_root_agent(AttachRootAgentRequest { + session_id: runtime_session_id.clone(), + cwd: std::path::PathBuf::from("/work"), + task: "root".to_string(), + max_concurrent_threads_per_session: 3, + }) + .expect("attach root"); + + let cleanup_store = Arc::clone(&store); + let cleanup_session_id = session_id.clone(); + runtime + .get_or_insert_session_resource( + &runtime_session_id, + "test.cleanup_before_done", + || 1usize, + move |_resource: Arc| { + let store = cleanup_store.lock().expect("store mutex poisoned"); + store + .append_event( + &cleanup_session_id, + "test.cleanup", + json!({ "phase": "terminal_barrier" }), + ) + .expect("append cleanup marker"); + 1 + }, + ) + .expect("register cleanup marker"); + let blocking_cleanup_store = Arc::clone(&store); + let blocking_cleanup_session_id = session_id.clone(); + runtime + .get_or_insert_session_resource( + &runtime_session_id, + "test.blocking_cleanup_before_done", + || 1usize, + move |_resource: Arc| { + let _client = reqwest::blocking::Client::new(); + let store = blocking_cleanup_store.lock().expect("store mutex poisoned"); + store + .append_event( + &blocking_cleanup_session_id, + "test.blocking_cleanup", + json!({ "phase": "terminal_barrier" }), + ) + .expect("append blocking cleanup marker"); + 1 + }, + ) + .expect("register blocking cleanup marker"); + + run_session_with_config_with_cancel_and_runtime( + Arc::clone(&store), + &session_id, + fake_config(), + CancellationToken::new(), + Some(runtime), + ) + .await + .expect("runtime-backed run"); + + let log = events(&store, &session_id); + let cleanup_seq = log + .iter() + .find(|event| event.event_type == "test.cleanup") + .map(|event| event.seq) + .expect("cleanup marker"); + let blocking_cleanup_seq = log + .iter() + .find(|event| event.event_type == "test.blocking_cleanup") + .map(|event| event.seq) + .expect("blocking cleanup marker"); + let done_seq = log + .iter() + .find(|event| event.event_type == names::SESSION_DONE) + .map(|event| event.seq) + .expect("session.done"); + assert!( + cleanup_seq < done_seq, + "terminal cleanup must be durable before session.done: cleanup={cleanup_seq}, done={done_seq}" + ); + assert!( + blocking_cleanup_seq < done_seq, + "blocking terminal cleanup must be durable before session.done: cleanup={blocking_cleanup_seq}, done={done_seq}" + ); + } + #[tokio::test] async fn token_status_prefers_provider_usage_over_whole_prompt_estimate() { let (_dir, store, session_id) = store_with_session(); diff --git a/crates/browser-use-agent/src/entrypoint/provider.rs b/crates/browser-use-agent/src/entrypoint/provider.rs index 3281a54c..a34ccbdc 100644 --- a/crates/browser-use-agent/src/entrypoint/provider.rs +++ b/crates/browser-use-agent/src/entrypoint/provider.rs @@ -120,6 +120,8 @@ pub type RealSamplingDriver = ModelSamplingDriver< RegistryRunner, >; +const DISABLE_LOCAL_SEARCH_ENV: &str = "BROWSER_USE_DISABLE_LOCAL_SEARCH"; + /// The production tool dispatcher type: a [`RegistryRunner`] whose approver is the /// REAL [`GuardianApprover`] (permissive sandbox seam). Named so the builder + the /// fused driver agree on the runner's generic arguments. @@ -1324,8 +1326,11 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( ); // `search`: locally-executed DuckDuckGo (Lite) web search — the client runs // the HTTP request and parses the results itself (distinct from the hosted - // `web_search` above). Read-only, so parallel_safe = true. - reg.register::<_, SearchRequest>("search", definitions::search(), true, SearchTool::new()); + // `web_search` above). Eval runs can disable it to avoid captcha-heavy + // detours while keeping hosted `web_search` available. + if local_search_enabled_for_run(config) { + reg.register::<_, SearchRequest>("search", definitions::search(), true, SearchTool::new()); + } let browser_backend = browser_backend_for_runtime_or_config( config, runtime_handle.as_ref(), @@ -1537,6 +1542,43 @@ fn legacy_subagent_tools_enabled_for_run(config: &ProviderRunConfig) -> bool { && config.options.child_agent_runner.is_some() } +fn local_search_enabled_for_run(config: &ProviderRunConfig) -> bool { + if config_override_bool_any( + &config.options.config_overrides, + &[ + "disable_local_search", + "tools.disable_local_search", + "search.disabled", + ], + ) == Some(true) + { + return false; + } + if config_override_bool_any( + &config.options.config_overrides, + &[ + "local_search_enabled", + "tools.local_search_enabled", + "search.enabled", + ], + ) == Some(false) + { + return false; + } + !std::env::var(DISABLE_LOCAL_SEARCH_ENV) + .ok() + .is_some_and(|value| env_flag_enabled(&value)) +} + +fn env_flag_enabled(value: &str) -> bool { + let normalized = value.trim().to_ascii_lowercase(); + !normalized.is_empty() + && !matches!( + normalized.as_str(), + "0" | "false" | "off" | "no" | "disabled" + ) +} + fn apply_role_tool_policy( reg: &mut crate::tools::registry::ToolRegistry, config: &ProviderRunConfig, @@ -3289,6 +3331,30 @@ mod tests { ); } + #[test] + fn disable_local_search_override_hides_duckduckgo_search_only() { + let options = crate::config_overrides::AgentRunOptions { + config_overrides: vec![( + "disable_local_search".to_string(), + toml::Value::Boolean(true), + )], + ..crate::config_overrides::AgentRunOptions::default() + }; + let config = + ProviderRunConfig::new(ProviderBackend::Fake, "fake-model").with_options(options); + let dispatcher = build_tool_dispatcher(Arc::new(MarkerPythonBackend), &config, None); + let names: Vec<&str> = dispatcher + .tool_specs() + .iter() + .map(|s| s.name.as_str()) + .collect(); + + assert!(names.contains(&"web_search")); + assert!(!names.contains(&"search")); + assert!(names.contains(&"browser_script")); + assert!(names.contains(&"done")); + } + /// A non-empty `mcp_servers` map registers the `mcp` tool. The stdio server /// command (`true`) connects to nothing useful, but `connect_all`'s per-server /// failure isolation still yields a manager and the registration wiring diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index db632c03..e07dfc9a 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -66,9 +66,12 @@ pub const DEFAULT_BROWSER_SCRIPT_TIMEOUT_SECS: u64 = 300; /// Default observe poll window (ms) for [`BrowserAction::Observe`]. /// -/// Long browser_script runs should be observed in coarse windows so the agent -/// does not burn many LLM turns polling the same run_id while work is ongoing. -pub const DEFAULT_OBSERVE_TIMEOUT_MS: u64 = 30_000; +/// Restored to the pre-regression (88-baseline) 1s default. The 30s default + +/// 30s clamp-floor ("observe30") blocked each observe for up to 30s, burning the +/// run-level task timebox on long-running scripts and leaving tasks unfinished +/// (e.g. real_v8 tasks 1, 4 never emitted session.done). The run-level timebox, +/// not coarse poll windows, is responsible for bounding total turns. +pub const DEFAULT_OBSERVE_TIMEOUT_MS: u64 = 1_000; pub const MAX_OBSERVE_TIMEOUT_MS: u64 = 120_000; /// Appended to `browser_script` stdout when the response carries image parts. @@ -79,15 +82,19 @@ pub const MAX_OBSERVE_TIMEOUT_MS: u64 = 120_000; pub const BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX: &str = "\n__browser_script_content__:"; /// Maximum bytes of browser-script text returned to the next model turn. /// -/// Full browser-script output is persisted through durable events/artifacts; the -/// inline model view is deliberately smaller because long eval tasks repeatedly -/// carry every prior tool result in later prompts. -pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 4 * 1024; +/// Matches the collected-output limit (SCRIPT_MAX_OUTPUT_CHARS = 120k): the +/// CURRENT turn must see the full script output to write correct follow-up +/// code (codex parity: fresh outputs are never capped; the context manager +/// truncates tool outputs only as they age into history via the policy*1.2 +/// rule in context/mod.rs). The old 4KB cap forced blind guess-first coding +/// (KeyError-class bug explosion 8->53 across the 88->81 eval regression). +pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 120 * 1024; const BROWSER_PREF_MODE: &str = "browser.preference.mode"; const BROWSER_PREF_BROWSER: &str = "browser.preference.browser"; const BROWSER_PREF_BROWSER_LABEL: &str = "browser.preference.browser_label"; const BROWSER_PREF_PROFILE: &str = "browser.preference.profile"; +const EVAL_MAX_OBSERVE_TIMEOUT_ENV: &str = "BROWSER_USE_EVAL_MAX_OBSERVE_TIMEOUT_MS"; const BROWSER_DOMAIN_PROFILE_PREFIX: &str = "browser.domain_profile."; const BROWSER_SCRIPT_MAX_IMAGE_DIMENSION: u32 = 8_000; const BROWSER_PREF_PROFILE_LABEL: &str = "browser.preference.profile_label"; @@ -201,12 +208,38 @@ impl BrowserRequest { } fn effective_observe_ms(&self) -> u64 { - self.observe_timeout_ms + let normal = self + .observe_timeout_ms .unwrap_or(DEFAULT_OBSERVE_TIMEOUT_MS) - .clamp(DEFAULT_OBSERVE_TIMEOUT_MS, MAX_OBSERVE_TIMEOUT_MS) + .clamp(DEFAULT_OBSERVE_TIMEOUT_MS, MAX_OBSERVE_TIMEOUT_MS); + match eval_max_observe_timeout_ms() { + Some(max_ms) => normal.min(max_ms.max(1_000)), + None => normal, + } } } +fn eval_max_observe_timeout_ms() -> Option { + // Observe timing is controlled ONLY by its own env var. It used to also be + // implicitly capped at 30s whenever BROWSER_USE_EVAL_DONE_AUDIT was set — + // a hidden cross-subsystem coupling (a finalization flag silently changing + // polling behavior) that made eval runs hard to reason about. + std::env::var(EVAL_MAX_OBSERVE_TIMEOUT_ENV) + .ok() + .and_then(|value| value.trim().parse::().ok()) + .filter(|value| *value > 0) +} + +#[allow(dead_code)] +fn env_flag_enabled(value: &str) -> bool { + let normalized = value.trim().to_ascii_lowercase(); + !normalized.is_empty() + && !matches!( + normalized.as_str(), + "0" | "false" | "off" | "no" | "disabled" + ) +} + /// Model-facing wire arguments for the browser tool. /// /// [`BrowserRequest`] is a PARSED form: its [`BrowserAction`] is an internally @@ -2375,7 +2408,7 @@ fn cap_inline_browser_script_stdout(text: String) -> String { let elided = text.len() - end; let mut out = text[..end].to_string(); out.push_str(&format!( - "\n... [browser_script stdout truncated, {elided} more bytes; full output persisted. Use a narrower browser_script extraction, the emitted summaries, or a saved artifact instead of re-reading broad page text.]" + "\n... [browser_script stdout truncated, {elided} more bytes; full output persisted to the run artifact. Read the saved artifact or re-extract the missing portion before relying on this output.]" )); out } @@ -3176,4 +3209,25 @@ mod browser_mode_tests { ) .unwrap(); } + + #[test] + fn stored_cloud_preference_rejects_managed_only_recovery() { + let dir = tempfile::tempdir().unwrap(); + let store = Store::open(dir.path()).unwrap(); + store.set_setting(BROWSER_PREF_MODE, "cloud").unwrap(); + + let err = resolve_browser_command_for_selected_mode( + Some(&store), + "browser recover restart-owned-browser", + None, + None, + ) + .unwrap_err(); + + assert!( + err.to_string() + .contains("restart-owned-browser only applies to managed Chromium"), + "{err:#}" + ); + } } diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index bd7ef14f..5c041f47 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -1422,7 +1422,7 @@ async fn script_truncated_structured_output_preserves_summary_first() { ); assert!( out.stdout - .contains("Use a narrower browser_script extraction"), + .contains("Read the saved artifact or re-extract the missing portion"), "stdout: {}", out.stdout ); @@ -1434,8 +1434,11 @@ async fn script_truncated_structured_output_preserves_summary_first() { } #[test] -fn browser_script_stdout_cap_defaults_to_four_kib_for_eval_cost() { - assert_eq!(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, 4 * 1024); +fn browser_script_stdout_cap_matches_collected_output_limit() { + // The CURRENT turn must see full script output to write correct follow-up + // code; history growth is handled by the context manager (policy*1.2). + // The old 4KB cap caused blind guess-first coding (88->81 regression). + assert_eq!(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, 120 * 1024); } #[tokio::test] @@ -1594,10 +1597,9 @@ async fn observe_timeout_is_clamped_to_coarse_poll_window() { }; let out = run_direct(&tool, &req).await.unwrap(); assert_eq!(out.exit_code, 0); - assert_eq!( - backend.last_observe_timeout_ms(), - Some(DEFAULT_OBSERVE_TIMEOUT_MS) - ); + // 5s sits inside the allowed [DEFAULT_OBSERVE_TIMEOUT_MS, MAX] window and + // passes through unchanged (the old 30s floor inflated short observes). + assert_eq!(backend.last_observe_timeout_ms(), Some(5_000)); req.observe_timeout_ms = Some(180_000); let out = run_direct(&tool, &req).await.unwrap(); diff --git a/crates/browser-use-agent/src/tools/handlers/done.rs b/crates/browser-use-agent/src/tools/handlers/done.rs index 6b38c8cf..d7564b1f 100644 --- a/crates/browser-use-agent/src/tools/handlers/done.rs +++ b/crates/browser-use-agent/src/tools/handlers/done.rs @@ -16,16 +16,11 @@ //! recognize that the agent declared itself finished, and so the final `text` //! (the summary) is surfaced to the host. //! -//! It does NOT itself force the turn loop to stop: the loop's termination signal -//! is "the model produced a final assistant message with NO tool calls" -//! ([`TurnRunOutcome::NoToolCalls`](crate::turn::loop::TurnRunOutcome)). A `done` -//! call is a tool call, so it is dispatched, recorded, and the loop re-samples; -//! the model then typically produces a final no-tool message and the loop stops. -//! Wiring the loop to treat a successful `done` [`ExecOutput`] as terminal (a -//! short-circuit) needs the loop's classifier (`turn/loop.rs` / -//! `turn/fusion.rs`) to inspect the dispatched tool name/output — those files are -//! outside this WP's owned set, so that deeper loop wiring is REPORTED, not -//! implemented here. +//! A successful `done` output is treated as terminal by the fused sampling loop. +//! In eval mode (`BROWSER_USE_EVAL_DONE_AUDIT=1`) this handler performs a small +//! completion audit first. If the final answer is obviously empty, placeholder +//! heavy, or explicitly partial, it rejects the call so the loop gives the model +//! one more repair turn instead of accepting a weak completion. //! //! # Parity grounding //! @@ -49,6 +44,12 @@ use crate::tools::runtime::{ Approvable, ExecOutput, SandboxAttempt, Sandboxable, ToolCtx, ToolError, ToolRuntime, }; use crate::tools::sandbox::{SandboxPermissions, SandboxPreference}; +use serde_json::Value; +use std::fs; +use std::path::{Path, PathBuf}; + +const EVAL_DONE_AUDIT_ENV: &str = "BROWSER_USE_EVAL_DONE_AUDIT"; +const DONE_AUDIT_TEXT_PREVIEW_BYTES: usize = 256 * 1024; /// The tool name surfaced to the model. /// @@ -207,12 +208,16 @@ impl ToolRuntime for DoneTool { &self, req: &DoneRequest, attempt: &SandboxAttempt<'_>, - _ctx: &ToolCtx, + ctx: &ToolCtx, ) -> Result { // No sandbox is exercised (the tool does no I/O); acknowledge the attempt // to make the seam explicit, matching the other tools. let _ = attempt; + if eval_done_audit_enabled() { + audit_done_request(req, ctx).map_err(ToolError::Rejected)?; + } + // Record the final summary into a deterministic, prefixed acknowledgement // the loop/host can recognize as the declared completion. The summary may // be empty (the model can declare done without a message). @@ -224,3 +229,228 @@ impl ToolRuntime for DoneTool { }) } } + +fn eval_done_audit_enabled() -> bool { + std::env::var(EVAL_DONE_AUDIT_ENV) + .ok() + .is_some_and(|value| env_flag_enabled(&value)) +} + +fn env_flag_enabled(value: &str) -> bool { + let normalized = value.trim().to_ascii_lowercase(); + !normalized.is_empty() + && !matches!( + normalized.as_str(), + "0" | "false" | "off" | "no" | "disabled" + ) +} + +pub(crate) fn audit_done_request(req: &DoneRequest, ctx: &ToolCtx) -> Result<(), String> { + let mut reasons = Vec::new(); + let mut audit_text = req.summary(); + let mut has_material_answer = !audit_text.trim().is_empty(); + + if let Some(result_file) = req + .result_file + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + match read_result_file_preview(result_file, ctx) { + Ok(preview) => { + has_material_answer = true; + if let Some(preview) = preview { + reasons.extend(json_audit_reasons(&preview)); + if !audit_text.is_empty() { + audit_text.push('\n'); + } + audit_text.push_str(&preview); + } + } + Err(reason) => reasons.push(reason), + } + } + + if !has_material_answer { + reasons.push("no final answer text or readable non-empty result_file".to_string()); + } + + // NOTE: phrase-based wording checks (text_audit_reasons) were removed. + // They rejected honest negative findings ("could not access", "blocked + // by", "source unavailable"), and the measured agent response was to + // resubmit the SAME data with laundered, more confident phrasing — the + // audit selected for wording over truth (real_v8 tasks 75, 77) and + // forced futile work on source-exhausted specs (task 87, 5.7 min after a + // correct final answer). Evidence-based checks (empty result, placeholder + // density) below carry the audit's measured wins (task 41). + reasons.extend(json_audit_reasons(&audit_text)); + + if reasons.is_empty() { + Ok(()) + } else { + Err(format!( + "done audit rejected the final answer: {}. Continue working, verify and fill the missing fields, then call done again. Only use an incomplete fallback if the run is genuinely out of turns.", + reasons.join("; ") + )) + } +} + +fn read_result_file_preview(path: &str, ctx: &ToolCtx) -> Result, String> { + let resolved = resolve_result_file(path, ctx); + let metadata = fs::metadata(&resolved).map_err(|error| { + format!( + "result_file `{}` is not readable at {} ({error})", + path, + resolved.display() + ) + })?; + if !metadata.is_file() { + return Err(format!( + "result_file `{}` is not a regular file ({})", + path, + resolved.display() + )); + } + if metadata.len() == 0 { + return Err(format!("result_file `{}` is empty", path)); + } + + let bytes = fs::read(&resolved).map_err(|error| { + format!( + "result_file `{}` could not be read at {} ({error})", + path, + resolved.display() + ) + })?; + let cap = bytes.len().min(DONE_AUDIT_TEXT_PREVIEW_BYTES); + Ok(std::str::from_utf8(&bytes[..cap]) + .ok() + .map(ToOwned::to_owned)) +} + +fn resolve_result_file(path: &str, ctx: &ToolCtx) -> PathBuf { + let requested = Path::new(path); + if requested.is_absolute() { + return requested.to_path_buf(); + } + + let cwd_path = ctx.cwd.join(requested); + if cwd_path.exists() { + return cwd_path; + } + + let artifact_path = ctx.artifact_root.join(requested); + if artifact_path.exists() { + return artifact_path; + } + + cwd_path +} + + +fn json_audit_reasons(text: &str) -> Vec { + let Some(value) = parse_first_json_value(text) else { + return Vec::new(); + }; + let mut reasons = Vec::new(); + if json_is_empty_result(&value) { + reasons.push("JSON result is empty".to_string()); + } + let mut stats = JsonPlaceholderStats::default(); + collect_json_placeholder_stats(&value, &mut stats); + if stats.total_scalar_fields >= 8 + && stats.placeholder_fields * 100 >= stats.total_scalar_fields * 30 + { + reasons.push(format!( + "JSON result has too many placeholder fields ({}/{})", + stats.placeholder_fields, stats.total_scalar_fields + )); + } + reasons +} + +fn parse_first_json_value(text: &str) -> Option { + let trimmed = text.trim(); + serde_json::from_str(trimmed).ok() +} + +fn json_is_empty_result(value: &Value) -> bool { + match value { + Value::Array(items) => items.is_empty(), + Value::Object(map) => { + if map.is_empty() { + return true; + } + let array_values = map + .values() + .filter_map(|value| value.as_array()) + .collect::>(); + !array_values.is_empty() && array_values.iter().all(|items| items.is_empty()) + } + _ => false, + } +} + +#[derive(Default)] +struct JsonPlaceholderStats { + total_scalar_fields: usize, + placeholder_fields: usize, +} + +fn collect_json_placeholder_stats(value: &Value, stats: &mut JsonPlaceholderStats) { + match value { + // A field explicitly set to null means the agent checked the source and + // recorded a genuine absence. Many tasks REQUIRE null/empty for + // unavailable fields, so count null toward the denominator but NOT as a + // placeholder. Counting it as a placeholder pushed the agent to delete + // required fields to satisfy the audit (real_v8 task 53 regression). + Value::Null => stats.total_scalar_fields += 1, + Value::Bool(_) | Value::Number(_) => stats.total_scalar_fields += 1, + Value::String(text) => { + stats.total_scalar_fields += 1; + if is_placeholder_string(text) { + stats.placeholder_fields += 1; + } + } + Value::Array(items) => { + for item in items { + collect_json_placeholder_stats(item, stats); + } + } + Value::Object(map) => { + for value in map.values() { + collect_json_placeholder_stats(value, stats); + } + } + } +} + +fn is_placeholder_string(text: &str) -> bool { + let normalized = text.trim().to_ascii_lowercase(); + // An empty string, like null, is a deliberate "checked, genuinely absent" + // sentinel — and many tasks explicitly MANDATE "" for missing values. + // Counting "" as a placeholder rejected a spec-correct answer and coerced + // the agent into writing literal placeholder prose instead (real_v8 task + // 94: first done used "" per spec, audit bounced it, rewrite with + // "Members only - please Login..." text then failed the judge). + if normalized.is_empty() { + return false; + } + matches!( + normalized.as_str(), + "unknown" + | "n/a" + | "na" + | "none" + | "null" + | "missing" + | "not found" + | "unavailable" + | "not available" + | "not listed" + | "not checked" + | "not extracted" + | "could not determine" + ) || normalized.starts_with("could not ") + || normalized.starts_with("unable to ") +} diff --git a/crates/browser-use-agent/src/tools/handlers/done_tests.rs b/crates/browser-use-agent/src/tools/handlers/done_tests.rs index 198e493d..a56e80c9 100644 --- a/crates/browser-use-agent/src/tools/handlers/done_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/done_tests.rs @@ -5,7 +5,7 @@ //! analog) — direct `run` calls plus one drive through the orchestrator over the //! seam. -use super::done::{DoneRequest, DoneTool, DONE_STDOUT_PREFIX}; +use super::done::{audit_done_request, DoneRequest, DoneTool, DONE_STDOUT_PREFIX}; use crate::tools::approval::AskForApproval; use crate::tools::orchestrator::{ToolOrchestrator, TurnEnv}; use crate::tools::runtime::{AutoApprover, SandboxAttempt, ToolCtx, ToolRuntime}; @@ -175,3 +175,72 @@ fn done_is_not_parallel_safe() { "done must be serial: completion is terminal and must not be reordered" ); } + +// ---- (6) eval-mode completion audit catches weak finals before terminal stop ---- + +#[test] +fn eval_done_audit_accepts_honest_partial_declaration() { + // Phrase-based wording rejections were removed: they trained the model to + // launder the SAME data behind confident phrasing (real_v8 tasks 75/77) + // and forced futile work on source-exhausted specs (task 87). Honest + // declarations are accepted; evidence checks (empty/placeholder JSON) + // still gate weak completions. + audit_done_request( + &DoneRequest::with_result("partial_incomplete: two fields were not checked"), + &ctx(), + ) + .expect("honest partial declaration is accepted"); +} + +#[test] +fn eval_done_audit_rejects_missing_result_file() { + let req = DoneRequest { + result_file: Some("missing-result.json".to_string()), + ..DoneRequest::default() + }; + + let err = audit_done_request(&req, &ctx()).expect_err("missing result file should reject"); + assert!(err.contains("result_file"), "got: {err}"); + assert!(err.contains("not readable"), "got: {err}"); +} + +#[test] +fn eval_done_audit_rejects_placeholder_heavy_json() { + let temp = tempfile::tempdir().unwrap(); + let path = temp.path().join("answer.json"); + std::fs::write( + &path, + r#"{ + "items": [ + {"name": "unknown", "price": "not found", "url": ""}, + {"name": "missing", "price": "unavailable", "url": "not checked"}, + {"name": "ok", "price": "12", "url": "https://example.test"} + ] + }"#, + ) + .unwrap(); + let req = DoneRequest { + result_file: Some(path.to_string_lossy().to_string()), + ..DoneRequest::default() + }; + + let err = audit_done_request(&req, &ctx()).expect_err("placeholder JSON should reject"); + assert!(err.contains("placeholder"), "got: {err}"); +} + +#[test] +fn eval_done_audit_accepts_non_empty_material_file() { + let temp = tempfile::tempdir().unwrap(); + let path = temp.path().join("answer.json"); + std::fs::write( + &path, + r#"{"items":[{"name":"A","price":"12","url":"https://example.test/a"},{"name":"B","price":"14","url":"https://example.test/b"}]}"#, + ) + .unwrap(); + let req = DoneRequest { + result_file: Some(path.to_string_lossy().to_string()), + ..DoneRequest::default() + }; + + audit_done_request(&req, &ctx()).expect("material JSON should pass"); +} diff --git a/crates/browser-use-agent/src/tools/handlers/shell.rs b/crates/browser-use-agent/src/tools/handlers/shell.rs index 5ab33d8c..1e6409c8 100644 --- a/crates/browser-use-agent/src/tools/handlers/shell.rs +++ b/crates/browser-use-agent/src/tools/handlers/shell.rs @@ -48,6 +48,7 @@ use crate::tools::unified_exec::{ /// Matches codex `DEFAULT_EXEC_COMMAND_TIMEOUT_MS` (exec.rs:51) and legacy /// `DEFAULT_SHELL_COMMAND_TIMEOUT_MS` (command.rs:26). pub const DEFAULT_SHELL_COMMAND_TIMEOUT_MS: u64 = 10_000; +pub const DEFAULT_EXEC_COMMAND_TIMEOUT_MS: u64 = 600_000; /// Exit code reported when a command is killed for exceeding its timeout. /// @@ -312,6 +313,10 @@ pub struct ExecCommandRequest { /// Initial output collection window before returning a process id. #[serde(default)] pub yield_time_ms: Option, + /// Maximum wall time before the process tree is killed. Defaults to a + /// generous cap so one forgotten command cannot pin the whole run. + #[serde(default)] + pub timeout_ms: Option, /// Maximum model-visible output tokens. #[serde(default)] pub max_output_tokens: Option, @@ -348,6 +353,10 @@ impl ExecCommandRequest { fn yield_time_ms(&self) -> u64 { self.yield_time_ms.unwrap_or(DEFAULT_EXEC_YIELD_TIME_MS) } + + fn timeout_ms(&self) -> u64 { + self.timeout_ms.unwrap_or(DEFAULT_EXEC_COMMAND_TIMEOUT_MS) + } } /// Send stdin to, or poll, a live unified exec process. @@ -610,7 +619,7 @@ impl ToolRuntime for ExecCommandTool { tty: req.tty, yield_time_ms: req.yield_time_ms(), max_output_tokens: req.max_output_tokens, - timeout_ms: None, + timeout_ms: Some(req.timeout_ms()), kill_on_cancel: false, call_id: ctx.call_id.clone(), tool_name: ctx.tool_name.clone(), diff --git a/crates/browser-use-agent/src/tools/handlers/shell_tests.rs b/crates/browser-use-agent/src/tools/handlers/shell_tests.rs index 76cbf324..caca0ef7 100644 --- a/crates/browser-use-agent/src/tools/handlers/shell_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/shell_tests.rs @@ -96,6 +96,7 @@ async fn exec_command_returns_session_id_and_write_stdin_polls_to_exit() { shell: None, login: None, yield_time_ms: Some(10), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -130,6 +131,45 @@ async fn exec_command_returns_session_id_and_write_stdin_polls_to_exit() { assert!(second.stdout.contains("Output:\ndone")); } +#[tokio::test] +async fn exec_command_timeout_kills_process_group() { + let dir = tempfile::tempdir().unwrap(); + let manager = UnifiedExecManager::deterministic_for_tests(); + let exec = ExecCommandTool::new(manager); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let exec_ctx = ctx_for(dir.path(), "exec_command"); + let marker = dir.path().join("late.txt"); + + let out = exec + .run( + &ExecCommandRequest { + cmd: Some("(sleep 1; printf bad > late.txt) & sleep 5".to_string()), + command: None, + workdir: None, + cwd: None, + shell: None, + login: None, + yield_time_ms: Some(1_000), + timeout_ms: Some(100), + max_output_tokens: None, + env: HashMap::new(), + tty: false, + }, + &attempt, + &exec_ctx, + ) + .await + .expect("exec_command returns timeout snapshot"); + + assert!(out.stdout.contains("Process exited with code 124")); + tokio::time::sleep(std::time::Duration::from_millis(1_200)).await; + assert!( + !marker.exists(), + "exec_command timeout should kill background children" + ); +} + #[tokio::test] async fn write_stdin_rejects_non_tty_input() { let dir = tempfile::tempdir().unwrap(); @@ -151,6 +191,7 @@ async fn write_stdin_rejects_non_tty_input() { shell: None, login: None, yield_time_ms: Some(10), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -205,6 +246,7 @@ async fn tty_write_stdin_sends_input_and_returns_exit_metadata() { shell: None, login: None, yield_time_ms: Some(10), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: true, @@ -262,6 +304,7 @@ async fn exec_command_applies_codex_env_defaults() { shell: None, login: None, yield_time_ms: Some(1_000), + timeout_ms: None, max_output_tokens: None, env, tty: false, @@ -300,6 +343,7 @@ async fn exec_command_interruption_preserves_live_session() { shell: None, login: None, yield_time_ms: Some(5_000), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -371,6 +415,7 @@ async fn exec_command_cancel_returns_live_session_snapshot() { shell: None, login: None, yield_time_ms: Some(30_000), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -429,6 +474,7 @@ async fn terminate_all_best_effort_kills_managed_processes() { shell: None, login: None, yield_time_ms: Some(250), + timeout_ms: None, max_output_tokens: None, env: HashMap::new(), tty: false, @@ -469,6 +515,7 @@ async fn exec_command_max_output_tokens_truncates_model_output() { shell: None, login: None, yield_time_ms: Some(1_000), + timeout_ms: None, max_output_tokens: Some(2), env: HashMap::new(), tty: false, diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 869a96ff..3a690e38 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -816,6 +816,10 @@ pub mod definitions { "type": "integer", "description": "How long to wait (in milliseconds) for output before yielding." }, + "timeout_ms": { + "type": "integer", + "description": "Maximum wall time in milliseconds before the process tree is killed. Defaults to a generous failsafe cap." + }, "max_output_tokens": { "type": "integer", "description": "Maximum number of tokens to return. Excess output will be truncated." @@ -1116,7 +1120,7 @@ to the single frame that proves the task succeeded." ToolDefinition { name: "done".to_string(), description: - "Signal that the task is finished, carrying the user-facing final answer. If the wall-clock or step budget is nearly exhausted, call done with the best verified partial result instead of continuing until external cancellation; clearly mark unknown, unavailable, or incomplete fields and name the remaining gaps." + "Signal that the user-facing task is complete, carrying the final answer. Call done only after the requested result is complete and verified or persisted. If the remaining step budget is nearly exhausted and a complete answer is no longer possible, call done with an explicitly incomplete fallback result that names the missing requirements." .to_string(), input_schema: json!({ "type": "object", diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 3b92411d..a74b0795 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -1024,13 +1024,14 @@ fn definitions_carry_required_fields_and_names() { } #[test] -fn done_definition_preserves_timebox_partial_result_guidance() { +fn done_definition_requires_complete_result_with_step_exhaustion_fallback() { let done = definitions::done(); - assert!(done.description.contains("best verified partial result")); - assert!(done.description.contains("external cancellation")); + assert!(done.description.contains("task is complete")); + assert!(done.description.contains("complete and verified")); + assert!(done.description.contains("remaining step budget")); assert!(done .description - .contains("unknown, unavailable, or incomplete fields")); + .contains("explicitly incomplete fallback result")); } #[test] diff --git a/crates/browser-use-agent/src/tools/unified_exec.rs b/crates/browser-use-agent/src/tools/unified_exec.rs index da97c91d..89616e49 100644 --- a/crates/browser-use-agent/src/tools/unified_exec.rs +++ b/crates/browser-use-agent/src/tools/unified_exec.rs @@ -7,6 +7,8 @@ use std::collections::HashMap; use std::io::{Read, Write}; +#[cfg(unix)] +use std::os::unix::process::CommandExt; use std::path::{Path, PathBuf}; use std::process::{Child as StdChild, Command as StdCommand, Stdio}; use std::sync::Arc; @@ -618,7 +620,10 @@ struct BlockingReader { } enum ManagedChild { - Pipe(StdChild), + Pipe { + child: StdChild, + process_group: Option, + }, Pty(Box), } @@ -626,6 +631,12 @@ enum ManagedStdin { Pty(Box), } +#[derive(Clone, Copy, Debug)] +struct ProcessGroup { + #[cfg(unix)] + pgid: libc::pid_t, +} + async fn spawn_managed_backend( argv: &[String], cwd: &PathBuf, @@ -652,6 +663,7 @@ async fn spawn_pipe_backend( .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()); + configure_pipe_child_process_group(&mut command); for (key, value) in env { command.env(key, value); } @@ -659,6 +671,7 @@ async fn spawn_pipe_backend( let mut child = command.spawn().map_err(|source| { ToolError::Other(anyhow::anyhow!("failed to spawn `{program}`: {source}")) })?; + let process_group = process_group_for_child(&child); let stdout = child.stdout.take(); let stderr = child.stderr.take(); let mut blocking_readers = Vec::new(); @@ -675,7 +688,10 @@ async fn spawn_pipe_backend( }); } Ok(SpawnedProcess { - child: ManagedChild::Pipe(child), + child: ManagedChild::Pipe { + child, + process_group, + }, stdin: None, pty_master: None, reader_count: blocking_readers.len(), @@ -683,6 +699,35 @@ async fn spawn_pipe_backend( }) } +fn configure_pipe_child_process_group(command: &mut StdCommand) { + #[cfg(unix)] + unsafe { + command.pre_exec(|| { + if libc::setsid() < 0 { + Err(std::io::Error::last_os_error()) + } else { + Ok(()) + } + }); + } + #[cfg(not(unix))] + let _ = command; +} + +fn process_group_for_child(child: &StdChild) -> Option { + #[cfg(unix)] + { + Some(ProcessGroup { + pgid: child.id() as libc::pid_t, + }) + } + #[cfg(not(unix))] + { + let _ = child; + None + } +} + async fn spawn_pty_backend( argv: &[String], cwd: &PathBuf, @@ -882,7 +927,7 @@ impl ManagedProcess { let status = { let mut child = self.child.lock().await; match &mut *child { - ManagedChild::Pipe(child) => child + ManagedChild::Pipe { child, .. } => child .try_wait() .map_err(|source| { ToolError::Other(anyhow::anyhow!("polling process: {source}")) @@ -971,9 +1016,11 @@ impl ManagedProcess { { let mut child = self.child.lock().await; match &mut *child { - ManagedChild::Pipe(child) => { - let _ = child.kill(); - wait_for_std_child_exit(child, Duration::from_secs(2)).await; + ManagedChild::Pipe { + child, + process_group, + } => { + terminate_std_child_tree(child, *process_group, Duration::from_secs(2)).await; } ManagedChild::Pty(child) => { let _ = child.kill(); @@ -993,9 +1040,11 @@ impl ManagedProcess { { let mut child = self.child.lock().await; match &mut *child { - ManagedChild::Pipe(child) => { - let _ = child.kill(); - wait_for_std_child_exit(child, Duration::from_secs(2)).await; + ManagedChild::Pipe { + child, + process_group, + } => { + terminate_std_child_tree(child, *process_group, Duration::from_secs(2)).await; } ManagedChild::Pty(child) => { let _ = child.kill(); @@ -1014,9 +1063,15 @@ impl ManagedProcess { { let mut child = self.child.blocking_lock(); match &mut *child { - ManagedChild::Pipe(child) => { - let _ = child.kill(); - wait_for_std_child_exit_blocking(child, Duration::from_secs(2)); + ManagedChild::Pipe { + child, + process_group, + } => { + terminate_std_child_tree_blocking( + child, + *process_group, + Duration::from_secs(2), + ); } ManagedChild::Pty(child) => { let _ = child.kill(); @@ -1296,6 +1351,18 @@ async fn wait_for_std_child_exit(child: &mut StdChild, max_wait: Duration) { } } +async fn terminate_std_child_tree( + child: &mut StdChild, + process_group: Option, + max_wait: Duration, +) { + signal_process_group(process_group, ProcessSignal::Terminate); + let _ = child.kill(); + wait_for_std_child_exit(child, max_wait).await; + signal_process_group(process_group, ProcessSignal::Kill); + wait_for_std_child_exit(child, Duration::from_millis(EXIT_WATCH_INTERVAL_MS)).await; +} + fn wait_for_std_child_exit_blocking(child: &mut StdChild, max_wait: Duration) { let deadline = Instant::now() + max_wait; loop { @@ -1307,6 +1374,42 @@ fn wait_for_std_child_exit_blocking(child: &mut StdChild, max_wait: Duration) { } } +fn terminate_std_child_tree_blocking( + child: &mut StdChild, + process_group: Option, + max_wait: Duration, +) { + signal_process_group(process_group, ProcessSignal::Terminate); + let _ = child.kill(); + wait_for_std_child_exit_blocking(child, max_wait); + signal_process_group(process_group, ProcessSignal::Kill); + wait_for_std_child_exit_blocking(child, Duration::from_millis(EXIT_WATCH_INTERVAL_MS)); +} + +#[derive(Clone, Copy)] +enum ProcessSignal { + Terminate, + Kill, +} + +fn signal_process_group(process_group: Option, signal: ProcessSignal) { + #[cfg(unix)] + { + let Some(process_group) = process_group else { + return; + }; + let signal = match signal { + ProcessSignal::Terminate => libc::SIGTERM, + ProcessSignal::Kill => libc::SIGKILL, + }; + unsafe { + let _ = libc::kill(-process_group.pgid, signal); + } + } + #[cfg(not(unix))] + let _ = (process_group, signal); +} + fn clamp_yield_time(yield_time_ms: u64, empty_poll: bool) -> u64 { if empty_poll { yield_time_ms @@ -1383,3 +1486,56 @@ fn signal_exit_code(status: &std::process::ExitStatus) -> i32 { let _ = status; -1 } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::time::Duration; + + use super::{SpawnProcessRequest, UnifiedExecManager}; + + fn request(argv: Vec, cwd: std::path::PathBuf, timeout_ms: u64) -> SpawnProcessRequest { + SpawnProcessRequest { + argv, + cwd, + env: HashMap::new(), + tty: false, + yield_time_ms: 1_000, + max_output_tokens: None, + timeout_ms: Some(timeout_ms), + kill_on_cancel: true, + call_id: "test-call".to_string(), + tool_name: "shell".to_string(), + emitter: None, + cancel: None, + } + } + + #[cfg(unix)] + #[tokio::test] + async fn timeout_kills_background_children_before_they_can_write() { + let dir = tempfile::tempdir().expect("tempdir"); + let late_file = dir.path().join("late.txt"); + let manager = UnifiedExecManager::deterministic_for_tests(); + + let snapshot = manager + .run_to_completion(request( + vec![ + "sh".to_string(), + "-c".to_string(), + "(sleep 1; printf bad > late.txt) & sleep 5".to_string(), + ], + dir.path().to_path_buf(), + 200, + )) + .await + .expect("run command"); + + assert!(snapshot.timed_out, "command should time out"); + tokio::time::sleep(Duration::from_millis(1_400)).await; + assert!( + !late_file.exists(), + "timed-out background child must not survive to write artifacts" + ); + } +} diff --git a/crates/browser-use-agent/src/turn/loop_driver.rs b/crates/browser-use-agent/src/turn/loop_driver.rs index e72821d2..9768e248 100644 --- a/crates/browser-use-agent/src/turn/loop_driver.rs +++ b/crates/browser-use-agent/src/turn/loop_driver.rs @@ -79,9 +79,15 @@ use super::{CompactionMode, SamplingDriver, TurnObserver, TurnState}; use crate::decision::{self, LoopStep}; use crate::events::TurnCtx; use crate::task::{TurnAbortReason, TurnLifecycleEvent}; -use browser_use_llm::schema::{ContentPart, Message, MessageRole}; +use browser_use_llm::schema::{ContentPart, FinishReason, Message, MessageRole}; use tokio_util::sync::CancellationToken; +/// Injected ONCE when a bounded (max_turns) run ends on plain text without a +/// successful `done` call. The model sometimes leaks planning text as its last +/// turn (e.g. confusion after compaction) and the loop would otherwise accept +/// that text as the final result (real_v8 task 24/45 class). +const CALL_DONE_NUDGE: &str = "You ended your turn with plain text instead of calling the done tool. If the task is finished, call done(result=...) (or done(result_file=path) for a saved artifact) with the complete final answer now. If it is not finished, continue working with tool calls."; + const FINAL_MAX_TURNS_NUDGE: &str = "This is the final allowed step for this run. Stop exploring and call the done tool with the best complete answer you can provide now. Include unknown or unavailable items explicitly instead of continuing to search."; const PROGRESS_MAX_TURNS_NUDGE: &str = "Progress checkpoint: If you have enough evidence, a saved artifact, or a complete-enough answer, stop further exploration and call the done tool now. Continue only for clearly missing required information that is likely to change the final answer."; @@ -160,6 +166,8 @@ impl TurnLoop { let mut can_drain = decision::initial_can_drain(turn_has_fresh_input); let mut last_agent_message: Option = None; let mut turns_run = 0usize; + let mut done_nudge_sent = false; + let mut pending_done_nudge = false; // Unbounded (`turn.rs:214`): NO max-turns counter. The only exits are // Complete, cancellation, or a hard error. @@ -191,6 +199,13 @@ impl TurnLoop { vec![ContentPart::text(PROGRESS_MAX_TURNS_NUDGE)], )); } + if pending_done_nudge { + pending_done_nudge = false; + request.push(Message::new( + MessageRole::Developer, + vec![ContentPart::text(CALL_DONE_NUDGE)], + )); + } // ---- 2. run one sampling round-trip ---- let outcome = match self @@ -229,6 +244,18 @@ impl TurnLoop { // ---- 4. act on the step (codex `turn.rs:250-355`) ---- match step { LoopStep::Complete => { + // Bounded (dataset/eval) runs should end with an explicit + // done() call: a text-only completion (finish_reason Stop / + // None — never a successful done's ToolUse) gets ONE nudge + // to call done before we accept the text as final. + let ended_without_done = + !matches!(outcome.finish_reason, Some(FinishReason::ToolUse)); + if max_turns.is_some() && ended_without_done && !done_nudge_sent { + done_nudge_sent = true; + pending_done_nudge = true; + can_drain = true; + continue; + } // Terminal: no follow-up needed and no compaction. Record the // final agent message and break (`turn.rs:340-355`). self.observer diff --git a/crates/browser-use-agent/src/turn/loop_tests.rs b/crates/browser-use-agent/src/turn/loop_tests.rs index 06a2097d..8a6e9429 100644 --- a/crates/browser-use-agent/src/turn/loop_tests.rs +++ b/crates/browser-use-agent/src/turn/loop_tests.rs @@ -23,7 +23,7 @@ use std::collections::VecDeque; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; -use browser_use_llm::schema::{ContentPart, Message, MessageRole}; +use browser_use_llm::schema::{ContentPart, FinishReason, Message, MessageRole}; use tokio_util::sync::CancellationToken; use crate::decision::{SamplingOutcome, TokenStatus}; @@ -245,11 +245,14 @@ fn follow_up(msg: &str) -> SamplingOutcome { /// A terminal outcome (no follow-up; the model is done). fn complete(msg: &str) -> SamplingOutcome { + // A completed turn in these tests models a successful done() call, which + // ends with a tool-use finish; a text-only (Stop/None) completion in a + // bounded run now triggers one call-done nudge first. SamplingOutcome { model_needs_follow_up: false, last_agent_message: Some(msg.to_string()), defers_mailbox_delivery_to_next_turn: true, - finish_reason: None, + finish_reason: Some(FinishReason::ToolUse), } } diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index 531739f4..36b6e18f 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -721,6 +721,19 @@ fn calls_done_tool(tool_calls: &[ContentPart]) -> bool { .any(|p| matches!(p, ContentPart::ToolCall { name, .. } if name == DONE_TOOL_NAME)) } +fn done_tool_succeeded(tool_calls: &[ContentPart], outputs: &[Message]) -> bool { + tool_calls + .iter() + .zip(outputs.iter()) + .any(|(call, output)| match call { + ContentPart::ToolCall { name, .. } if name == DONE_TOOL_NAME => { + let (_, is_error) = tool_result_text_and_status(output); + !is_error + } + _ => false, + }) +} + /// The final summary carried by the model's `done` call, if any. /// /// Reads the `result` field from the first `done` tool call's JSON arguments, @@ -989,19 +1002,11 @@ impl SamplingDriver match (&self.dispatcher, &self.recorder, tool_calls.is_empty()) { // Fused path with at least one tool call. (Some(dispatcher), Some(recorder), false) => { - // A `done` call declares the turn finished: dispatch it (so the - // summary is recorded) but report NO follow-up, terminating the - // loop. Detect it BEFORE the calls vec is consumed by dispatch. - let is_terminal = calls_done_tool(&tool_calls); - // Surface the `done` summary as the turn result when the model - // declared completion via `done` and streamed no other text, so - // the loop returns the summary (codex keeps the final message). - if is_terminal && last_agent_message.is_none() { - last_agent_message = done_summary(&tool_calls); - if last_agent_message.is_some() { - acc.defers_mailbox_delivery_to_next_turn = true; - } - } + // A successful `done` call declares the turn finished. Detect + // the call before dispatch, but decide terminality only after + // the handler output has been recorded; eval-mode done audit + // can reject weak completions and should get a repair turn. + let has_done_call = calls_done_tool(&tool_calls); // 1. Record the assistant message (text + tool calls), so the // recorded transcript carries the call before its output. @@ -1025,14 +1030,29 @@ impl SamplingDriver recorder.record(&result.outputs_in_order).await; } - // A `done` call is TERMINAL: the model declared completion, so - // the loop must stop even though a tool ran. Otherwise, follow-up - // iff a tool actually ran (codex re-samples after feeding tool - // outputs back into history). - if is_terminal { + let done_succeeded = has_done_call + && done_tool_succeeded(&tool_calls, &result.outputs_in_order); + + // Surface the successful `done` summary as the turn result + // when the model declared completion and streamed no other + // text, so the loop returns the summary (codex keeps the final + // message). Rejected done calls keep following up. + if done_succeeded && last_agent_message.is_none() { + last_agent_message = done_summary(&tool_calls); + if last_agent_message.is_some() { + acc.defers_mailbox_delivery_to_next_turn = true; + } + } + + // A successful `done` call is TERMINAL. A rejected `done` call + // needs follow-up so the model can repair the final answer. + if done_succeeded { false } else { - decision::needs_follow_up(result.needs_follow_up, false) + decision::needs_follow_up( + result.needs_follow_up || has_done_call, + false, + ) } } // Text-only sampler, OR fusion configured but no tool call: the diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index ed46a265..4d3e5977 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -29,7 +29,7 @@ const BU_API: &str = "https://api.browser-use.com/api/v3"; const LOG_LIMIT: usize = 250; const SCRIPT_MAX_OUTPUT_CHARS: usize = 120_000; const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 15_000; -const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 30_000; +const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 1_000; const BROWSER_SCRIPT_HELPERS: &str = include_str!("browser_script_helpers.py"); const BROWSER_CONNECT_LOCAL_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(120); const BROWSER_CONNECT_ATTACH_DEADLINE: Duration = Duration::from_secs(8); @@ -276,8 +276,8 @@ static MANAGED_BROWSER_PIDS: OnceLock>> = OnceLock::new(); static LOCAL_CDP_CONNECTIONS: OnceLock>> = OnceLock::new(); static BROWSER_SCRIPT_RUN_COUNTER: AtomicU64 = AtomicU64::new(1); -const BROWSER_SCRIPT_COMPLETED_CACHE_TTL_MS: u128 = 10 * 60 * 1_000; -const BROWSER_SCRIPT_COMPLETED_CACHE_MAX: usize = 128; +const BROWSER_SCRIPT_COMPLETED_CACHE_TTL_MS: u128 = 60 * 60 * 1_000; +const BROWSER_SCRIPT_COMPLETED_CACHE_MAX: usize = 2048; const MANAGED_BROWSER_PROFILE_PREFIX: &str = "but-managed-browser."; const MANAGED_BROWSER_MARKER_FILE: &str = "BrowserUseManagedChrome.json"; @@ -1281,8 +1281,17 @@ pub fn observe_browser_script_with_registry( }; let mut run = match lookup { BrowserScriptRunLookup::Run(run) => run, - BrowserScriptRunLookup::Cached(output) => return Ok(output), - BrowserScriptRunLookup::Unknown => bail!("unknown browser_script run_id {run_id:?}"), + BrowserScriptRunLookup::Cached(mut output) => { + // Repeat observe of a finished run: the result was already + // delivered once. Make that unmissable so the model stops + // polling/cancelling completed run_ids (a measured top turn sink). + output.text.push_str( + "\n[note: this run already completed and its result was already delivered. Do not observe or cancel this run_id again; use the output above or read its checkpoint/result files.]", + ); + output.next_observe_ms = None; + return Ok(output); + } + BrowserScriptRunLookup::Unknown => return Ok(unknown_browser_script_run_output(run_id)), }; if run.session_id != session_id { let owner = run.session_id.clone(); @@ -1359,6 +1368,38 @@ pub fn observe_browser_script_with_registry( } } +fn unknown_browser_script_run_output(run_id: &str) -> BrowserScriptOutput { + BrowserScriptOutput { + ok: true, + status: Some("not_found".to_string()), + run_id: Some(run_id.to_string()), + text: format!( + "No active or recently completed browser_script run was found for run_id: {run_id}. The id may be stale, already consumed, or from an older turn. Do not retry the same observe blindly; inspect the current page state or start a new browser_script if more data is needed." + ), + data: json!({ + "status": "not_found", + "run_id": run_id, + }), + ..Default::default() + } +} + +fn unknown_browser_script_cancel_output(run_id: &str) -> BrowserScriptOutput { + BrowserScriptOutput { + ok: true, + status: Some("not_found".to_string()), + run_id: Some(run_id.to_string()), + text: format!( + "No active browser_script run was found to cancel for run_id: {run_id}. The run may have already finished, timed out, or been cancelled. Continue by inspecting the current page state or starting a new browser_script if more data is needed." + ), + data: json!({ + "status": "not_found", + "run_id": run_id, + }), + ..Default::default() + } +} + pub fn cancel_browser_script(session_id: &str, run_id: &str) -> Result { secrets_runtime::finish_with_redaction( session_id, @@ -1371,11 +1412,13 @@ pub fn cancel_browser_script_with_registry( run_id: &str, registry: &BrowserScriptRunRegistry, ) -> Result { - let mut run = registry + let Some(mut run) = registry .lock() .expect("browser_script run registry poisoned") .remove(run_id) - .ok_or_else(|| anyhow!("unknown browser_script run_id {run_id:?}"))?; + else { + return Ok(unknown_browser_script_cancel_output(run_id)); + }; if run.session_id != session_id { let owner = run.session_id.clone(); registry @@ -13230,14 +13273,12 @@ print("bridge retry ok") "private browser_script runs must not be inserted into the legacy global registry" ); let run_id = started.run_id.as_deref().unwrap(); - let global_err = observe_browser_script(session_id, run_id, 50) - .expect_err("global registry must not see private run"); - assert!( - global_err - .to_string() - .contains("unknown browser_script run_id"), - "unexpected global observe error: {global_err}" - ); + let global_output = + observe_browser_script(session_id, run_id, 50).expect("unknown observe is recoverable"); + assert_eq!(global_output.status.as_deref(), Some("not_found")); + assert!(global_output + .text + .contains("No active or recently completed browser_script run")); let mut finished = observe_browser_script_with_registry(session_id, run_id, 2_500, ®istry).unwrap(); @@ -13251,6 +13292,34 @@ print("bridge retry ok") assert_eq!(registry.active_run_count_for_session(session_id), 0); } + #[test] + fn browser_script_observe_unknown_run_id_is_recoverable() { + let registry = BrowserScriptRunRegistry::new(); + let output = + observe_browser_script_with_registry("script-unknown-run", "bs-missing", 50, ®istry) + .unwrap(); + + assert!(output.ok); + assert_eq!(output.status.as_deref(), Some("not_found")); + assert_eq!(output.run_id.as_deref(), Some("bs-missing")); + assert!(output.text.contains("inspect the current page state")); + } + + #[test] + fn browser_script_cancel_unknown_run_id_is_recoverable() { + let registry = BrowserScriptRunRegistry::new(); + let output = + cancel_browser_script_with_registry("script-unknown-cancel", "bs-missing", ®istry) + .unwrap(); + + assert!(output.ok); + assert_eq!(output.status.as_deref(), Some("not_found")); + assert_eq!(output.run_id.as_deref(), Some("bs-missing")); + assert!(output + .text + .contains("No active browser_script run was found to cancel")); + } + #[test] fn browser_script_observe_can_return_no_new_output() { let temp = tempfile::tempdir().unwrap(); diff --git a/crates/browser-use-secrets/src/lib.rs b/crates/browser-use-secrets/src/lib.rs index 0718eb73..cf992cc6 100644 --- a/crates/browser-use-secrets/src/lib.rs +++ b/crates/browser-use-secrets/src/lib.rs @@ -497,7 +497,10 @@ mod tests { // Inputs without `/` or `\` are emitted unchanged, so keys already // persisted on disk stay valid (backward compatibility). assert_eq!(account_for("github.com", "password"), "github.com/password"); - assert_eq!(account_for("\u{1}agentmail", "token"), "\u{1}agentmail/token"); + assert_eq!( + account_for("\u{1}agentmail", "token"), + "\u{1}agentmail/token" + ); // And the store actually keeps such pairs separate end-to-end. let store = InMemorySecretStore::new(); diff --git a/docs/eval-journal/quickpack-k2-validation-report.md b/docs/eval-journal/quickpack-k2-validation-report.md new file mode 100644 index 00000000..2b1d6602 --- /dev/null +++ b/docs/eval-journal/quickpack-k2-validation-report.md @@ -0,0 +1,55 @@ +# Quick-pack validation: k=2 runs, strict-judged (2026-06-10) + +Code: `fix/real-v8-restore-88-v2` @ e04ff76 (rungs 0–4 + quick-wins + efficiency pack). +Runs: `real-v8-qwpack-k1-…051745`, `real-v8-qwpack-k2-…055927` (conc 25, max_attempts 1, audit on). +Judge: strict calibrated panel (reproduces 88-baseline=89, 81-baseline=80). Matrices: `/home/exedev/eval-runs/k1k2_matrix.csv`. + +## The number + +| Run | Strict score | +|---|---| +| 88-baseline (old) | 89 | +| 81-baseline (regressed) | 80 | +| 85-run (rungs 0–4 only) | 85 | +| **K1** | **82** | +| **K2** | **90** ← first run to beat the old baseline | +| **k=2 mean** | **86** | + +The k1↔k2 spread (8 points on IDENTICAL code) is the loudest finding: single-run scores carry ±4 of pure stochastic path/site/judge luck. The mean moved 85→86; the distribution's ceiling moved 85→90. + +## Head-to-head vs the 88-run (same panel) + +**K2 vs 88: K2 wins net +1.** 88 beats K2 only on 24, 33, 59 — all three are *hallucination-luck* tasks (24: K2's final JSON contradicted its own saved result.json; 33: K2 invented the requested 10/30GB tiers — the exact anchor-8 sin — while K1 honestly reported they don't exist and PASSED; 59: K2 invented an email). K2 beats 88 on 36, 43, 75, 91. +**K1 vs 88:** K1 loses on 11 stochastic recurrences of known classes (see below). + +**Stable both-fail core (7):** 9 (entity-frame substitution), 15 (brand coverage at scale), 27 (extraction quality), 74 (product matching gates), 67/87/88 (impossible quantity specs — dataset issues, ~3 permanently dead points). + +## Did the quick-pack mechanisms work? (telemetry + verdicts) + +| Fix | Evidence | Verdict | +|---|---|---| +| Audit `""` ≠ placeholder | task 94 PASSES in BOTH runs (was audit-coerced fail); audit rejections 8→1/run | ✅ confirmed | +| Wording-police removal | k2-72 PASSED with an honest negative + documented negative control + screenshots — judges cited it as anchor-5 evidence; 75 passes honestly in both | ✅ confirmed | +| Anti-laundering prompts | 75 no longer domain-copies practice_name (both runs); 72 fixed in k2; k1-72 still hallucinated provenance → probabilistic, not deterministic | ◐ works, stochastic | +| Reality-probe audits | 26: both runs reconciled against the API total (687=687); 32-k2: 102 stores incl. all Chicago; 32-k1 still 70 | ◐ works, stochastic | +| Compaction fix | 6 compactions in k1 with ZERO task-24-style meltdowns | ✅ no recurrence | +| done-nudge | 0 firings — no leaked-monologue endings occurred | ✅/untested | +| Observe sentinel | "stop polling" note fired 106/76×; observes 287→270/212 (k2 −26%) | ✅ modest | +| Efficiency overall | minutes 629→669/615, turns 2101→2222/1970 | ≈flat (big items — events track, warm runtime — not in yet, by design) | + +## What is still broken (ranked, the next targets) + +1. **Hallucination-at-finalization is now the #1 real pathology** (k1-11 fabricated file contents; k2-24 final JSON contradicting its own result.json; k2-33 invented tiers; k2-59 invented email; k1-43 blank screenshots shipped). Mechanism: when the model RE-TYPES or recalls instead of reading from its saved artifact, it fabricates. Fix: make `done(result_file)` the enforced default when a result file exists (harness assembles the final from the file; inline `result` only when no file exists) — this converts a behavioral rule into a mechanical guarantee. +2. **Audit-rejection → turn-cap → nothing captured** (k1-22, k2-15): one rejection late in a run leaves no budget to repair; the artifact fallback found no `result.*` file because checkpoints used other names. Fixes: (a) fallback should also match the newest *.json/*.csv checkpoint, (b) audit rejections within N turns of the cap should accept-with-warning instead. +3. **Frame/entity substitution persists stochastically** (9 both runs, 40-k1, 60-k1's invented 24/page pagination): the page-frame rule needs to be harder ("mirror the site's own pagination size; verify the first row of your output appears on the rendered first page"). +4. **Coverage-at-scale tasks** (15: brand requires walking ~29 brand categories; 27: per-card provider quality; 61-k1: 5 of 28 products) — chunked-checkpoint loops exist now but the model still sometimes stops at the first pass; needs the reality-probe habit to bind on *category* coverage, not just totals. +5. **Variance itself**: 11 tasks flipped between identical runs. Until k≥2 is the standard protocol, single-run claims are noise. The scripted pipeline (`/home/exedev/eval-runs/run_real_v8.sh`) makes k=2 one command. + +## High-level heuristics (cumulative, validated across 4 strict-judged runs) + +- Feedback visibility > everything (the 4KB cap was worth ~5 points alone). +- Honesty must be mechanically cheaper than confidence: every gate that punished honest wording produced laundering or fabrication; every fix that made honesty acceptable (null/"" rules, wording-police removal, negative controls) flipped tasks to PASS. +- Audits must reconcile against external reality, not self-consistency. +- Final answers must be file-derived, never memory-retyped (the remaining hallucination class). +- The efficiency frontier is the events track (observe churn) + warm runtime (script startup), not more prompt rules. +- ~3 points of this dataset are unwinnable (67/87/88 impossible specs); the practical ceiling is ~96–97. diff --git a/docs/eval-journal/run-FULL-100-task-deep-dive.md b/docs/eval-journal/run-FULL-100-task-deep-dive.md new file mode 100644 index 00000000..fe8ffa9d --- /dev/null +++ b/docs/eval-journal/run-FULL-100-task-deep-dive.md @@ -0,0 +1,61 @@ +# 100-task deep-dive: RUNNEW (real-v8-restore88-FULL-20260610-004242, strict 85/100) + +Per-task autopsies by 10 agents over the run's state.db + artifacts. Full per-task blocks live in the session transcript; this doc is the synthesis. Stats skeleton: `task_stats.csv` in the run dir. + +## Top pathologies, ranked by total minutes/turns burned across 100 tasks + +1. **Observe-poll/cancel churn (~60–80 turns run-wide; the #1 turn sink).** Background scripts get polled 3–16× (often 1s thrash-polls or blind 120s blocks), then frequently cancelled — including cancels of runs that ALREADY finished (`status:not_found`, 5× on task 81 alone) and re-polls after result.json existed. Fixes: observe must state run status unambiguously; treat artifact-written as terminal; one generous poll instead of escalation; when a script checkpoints to disk, read the file from shell instead of polling. + +2. **Monolithic long scripts vs the 180s cap, without checkpoints (~15 min: tasks 68×3, 88, 27, 15×2, 74).** "Navigate N sites / click-wait loops" written as one script, dying at the cap with zero persisted state, then retried in the same shape (68 did this 3×; the timeout message's own checkpoint advice ignored). Fix: enforce/teach chunking ≤60% of cap + per-item checkpoint files. + +3. **Inline `done(result=...)` re-typing saved artifacts (~10+ min: 26=287s, 60=121s, 51=137s, 53=100s, 44=92s...).** The model regenerates a 41KB JSON token-by-token when result.json already exists and `done(result_file=)` finishes in seconds. Fix: nudge/hard rule — file exists & validates ⇒ pass the file; bounded inline sample only. + +4. **Self-inflicted script bugs, 3 recurring classes (~50 failed scripts run-wide, ~1 turn each):** + (a) `js()` return-shape guessing (KeyError/TypeError on dicts; arrow-vs-IIFE confusion — 6 failures in tasks 51–60 alone); + (b) Python-built regex/strings injected into JS → SyntaxError (22, 29, 38, 63, 70); + (c) cross-script variable reuse → NameError (every browser_script is a fresh process; bit task 33 twice consecutively). + Fixes: document js() semantics in error text; auto-JSON.stringify returns; "print shape once before consuming" convention. + +5. **Sequential fetching of independent URLs (task 68: 4 min for what a concurrent rewrite did in 0.4s).** Rule: ≥3 independent URLs ⇒ concurrent fetch with per-request timeouts; browser only for the JS-rendered residue. + +6. **Done-audit perversity — the run's own strict-fail generator:** + - Counts task-mandated `""` as placeholders (task 94: FIRST answer was correct per spec with `""`; audit rejected "149/418 placeholders"; agent rewrote them into literal placeholder text; strict judge failed it). The null-fix in done.rs must extend to empty strings (at minimum when the task mandates them). + - Rejects honest negative findings ("declares blocked source" 75, "unverified data" 77) → agents resubmit same data with laundered wording. The audit selects for confident phrasing over truth. + - Forces futile enrichment on impossible specs (87: 5.7 min / 56% of runtime after a correct source-exhausted answer). + - When it works it's cheap and good (41: rejection → 4 turns → genuine fix → pass). + +7. **Identifier/entity "laundering" — the new top strict-fail mechanism (72, 74, 75).** Agent ships a proxy as the required value with a rationalizing note, despite disconfirming evidence in its own output: Excel date-serial 46082 as "operator ID" (header literally said ReportDate); fuzzy product matching with no brand/model gate attaching wrong-product offers; website domain copied into practice_name for 155 rows. Fix: positive-label match or negative control before asserting equivalence (would a different operator get a different value?); audits should flag field_x==field_y duplication. + +8. **Audits verify self-consistency, never reality (31, 32, 36).** count==own-list while Chicago has zero Target stores; 322 accepted without checking the UI's 354 facet; 80/104 nulls shipped with a note. One ground-truth probe per audit (largest-city present? site-stated total? UI count?) flips all three. + +9. **Search engines are dead weight (~8 min run-wide: 72×8 attempts, 59, 68, 75).** Google returns a JS stub, Bing raw soup, DDG blocked from shell egress. Either restore a real search tool or stop reaching for engines. + +10. **Compaction handoff catastrophically broken (task 24, full task lost).** A 1.5MB emit_output forced mid-turn compaction; the summary pass produced EMPTY text (SUMMARY_PREFIX + ""), no "(no summary available)" fallback, no tool state; amnesiac model burned 30 turns polling `browser status`; a tool-free text blob became the final. Three bugs: unbounded emit_output, empty-summary handoff, plain-text-ends-session. + +11. **Model latency is a top-3 minute sink.** 20–60s gaps before each big script (task 41: 14.5 of 28.4 min was inter-tool latency at ~110K context; task 90: one 122s gap = 49% of runtime). Compounds with every rewrite/retry. Fewer, smaller scripts and less screenshot-bloat directly cut this. + +12. **Misc infra:** artifact save returned empty URL → 5 min of cloud-API archaeology (62); tool.failed double-emits per script failure (inflates metrics ~2×); missing openpyxl pushed manual XLSX parsing (72's fatal misread); /tmp scratch script named inspect.py shadowed stdlib (54); 455-bot-block on http_get correctly recovered via browser_fetch (29). + +## What WORKS (reinforce; these were the fast passes) +- API/endpoint-first after one quick probe: wp-json (65, 87), Algolia (14, 97), DataTables replay (66), XHR capture (53, 60), llms.txt (54), archive-URL guessing (52), JSON-LD (30), __NEXT_DATA__/Nuxt payload (25, 31, 39). +- The reference trajectory shape: recon → ONE extract+audit script → verify file → done(result_file). Tasks 56 (1.4 min), 84 (0.9), 69 (1.6), 49 (1.8), 83 (1.4) all follow it. +- In-page js fetch when http_get is bot-blocked (42); cookied requests.Session replay for ASP sites (67). +- Honest "not found + here's what WAS found" passes judges (66, 59). + +## Strict-fail census (15 fails) +- Impossible/over-ask specs: 67, 87, 88 (+72 arguably) — no agent behavior fixes these; dataset/judge issue. +- Audit-induced: 94 ("" coercion). +- Proxy-laundering: 72, 74, 75. +- Reality-check missing: 31, 32, 36. +- Frame drift: 23 (API reconstruction ≠ displayed table). +- Compaction: 24. +- Site-state vs rubric: 33 (promo tiers replaced requested base tiers). +- Required-field coverage at scale: 15, 27 (monolithic-script timeout induced). + +## Next-fix shortlist (by expected value) +1. done-audit: treat task-mandated `""` like null; add source-exhausted escape hatch; stop rejecting honest negatives. +2. Compaction: never ship an empty summary; cap emit_output; text-only turn ⇒ done-nudge, never terminal. +3. Observe/lifecycle: status clarity, artifact=terminal, no-poll-after-checkpoint. +4. Script hygiene pack: js() docs+stringify, chunk-by-default with checkpoints, concurrency rule, print-shape-first. +5. Audit reality-probe: one external reconciliation per collection task (site-stated total / UI count / largest-entity present). +6. done(result_file) nudge to kill inline re-typing. diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index 3458e691..061ff8a8 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -4,7 +4,7 @@ Raw CDP is the center of page interaction. Treat `cdp("Domain.method", ...)` ins The `browser` tool behaves like a CLI for browser runtime management. Use it for `browser status --json`, `browser connect local`, `browser local setup`, `browser connect managed`, `browser remote start`, `browser doctor`, explicit recovery, profile summaries, runtime logs, and ownership checks. It does not interact with pages. -The `browser_script` tool runs fresh Python in a browser-connected environment. Browser/CDP state persists in Rust; Python variables do not persist across calls. Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `http_get_many`, `browser_fetch`, `browser_fetch_many`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. +The `browser_script` tool runs fresh Python in a browser-connected environment. Browser/CDP state persists in Rust; Python variables do not persist across calls. Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `browser_fetch`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. `browser_script` has a start/listen lifecycle. A fast call returns final output immediately. A longer call returns `status: running` plus `run_id`; observe it with `action="observe"` until final status. If observe returns no new output for its wait window, back off instead of polling constantly. Images/artifacts emitted by the running script are returned by observe as soon as they exist. Use `action="cancel"` with the `run_id` only when the running script is no longer useful. @@ -29,12 +29,12 @@ Browser-harness workflow: - First navigation should usually be `new_tab(url)`, not `goto_url(url)`, because `goto_url` mutates the active tab. `new_tab(url)` and `goto_url(url)` have zero implicit wait: they send the CDP navigation command and then return without waiting for readyState, network idle, selectors, paint, or sleeps. If you chain more work in the same script after navigation, explicitly wait or poll before reading/clicking. If navigation is the last action before yielding to the model, the LLM call itself may provide enough elapsed time; the next call must still inspect state before assuming the page loaded. - When a task is site-specific and a matching domain skill exists, read it before inventing selectors, private API routes, or flows. Use `domain_skills_for_url(url, include_content=True)` before or immediately after navigation; `goto_url(url)` also records matching skill metadata in the tool result. -- Use screenshots as labeled temporal checkpoints when visual state matters. For text-heavy research, document reading, search, pricing, tables, and list extraction, default to `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. Capture visual state before and after meaningful browser actions only when layout, coordinates, blockers, menus, dialogs, downloads, uploads, form submissions, or final visual verification matter. +- Use screenshots as labeled temporal checkpoints. Screenshots are often the fastest way to understand the page, spot blockers, read visible state, and verify what changed. Capture visual state before and after meaningful browser actions: initial load, clicks, scrolls, route changes, menus, dialogs, downloads, uploads, form submissions, and final verification. For screenshot or visual-output tasks, verify the artifact is contentful and nonblank before `done`. - Prefer coordinate clicks for visible targets. Use `screenshot` or `capture_screenshot`, inspect the pixels, `click_at_xy(x, y)`, then screenshot again to verify. Chrome hit-testing handles iframes, shadow DOM, and cross-origin content better than selector abstractions. - For forms, behave like a browser user. Inspect visually with screenshots first; use read-only JS only when pixels are insufficient to identify labels or stable selectors. Click into visible text fields before typing. Use `type_text(...)`, `press_key(...)`, or `fill_input(...)` for text, and real coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Never bulk-fill a live form by setting DOM values, setting checked state, dispatching synthetic form events, or running a restore loop; this can desynchronize framework state from the visible DOM. - Prefer capturing the action timeline inside one `browser_script` tool call when possible: `screenshot("before_click")`, perform the action, wait for the state change, then `screenshot("after_click")`. - Do not call `screenshot` repeatedly on an unchanged viewport. Once you have a screenshot, either take an action, inspect with CDP/JS, navigate, scroll, call `screenshot_clip(...)` for a different region, wait for an async transition, or finish. Every screenshot should have a purpose: observe current state, verify an action, inspect a changed region, or preserve final evidence. -- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. If you have three or more independent URLs, files, documents, or API endpoints to inspect, batch them in one `browser_script` call with `http_get_many` or `browser_fetch_many` instead of visiting them one at a time. +- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. Use scripts to make source-completion faster, but keep each extraction chunk small enough to inspect progress, counts, missing fields, and source coverage before continuing. - `js(...)` returns Python values. After `text = js("document.body.innerText")`, use Python slicing like `text[:1000]`; only use JavaScript methods such as `.slice(...)` inside the JavaScript expression itself. - After actions that trigger loads, SPA transitions, XHR/fetch, menus, dialogs, downloads, uploads, or other visible state changes, be patient by making several cheap observations, not one long blind wait. Prefer short waits, then inspect again with `page_info()` or a screenshot. A wait returning false is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. - If redirected to an auth wall or credential prompt, stop and ask the user. Do not infer or type credentials from screenshots. @@ -51,10 +51,14 @@ Python namespace rule: `browser_script` variables do not persist across calls. S Durable helper rule: if you discover a reusable selector, site quirk, private API, or interaction helper, put the smallest useful helper in `.browser-use/agent-workspace/agent_helpers.py` and use it on later calls. The file is auto-loaded when it changes; call `load_agent_helpers()` if you need to force reload. Keep helpers task-focused, CDP-friendly, and free of secrets. Do not build manager layers, retry frameworks, page-object frameworks, or wrapper abstractions unless the task itself absolutely requires it. -Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist of every required row and field. Spend work across the checklist, not indefinitely on one difficult item. For each item/source, use a small number of targeted attempts, then either record the best verified value, mark it unavailable/unknown with the source and reason, or move to a better source. Do not keep varying one search term while other required rows are untouched. Before `done`, audit the checklist: every requested row/field must be filled, explicitly unavailable/unknown, or clearly reported as partial with the remaining gaps. +Never fabricate or pattern-guess values (emails, IDs, prices, names) to fill required fields or to satisfy an audit; every value must come from a source you actually observed. An honest null or "not found" with the checked source noted is an acceptable final value; a fabricated value is never acceptable and is worse than an empty field. Never satisfy a required field with a proxy copied from another field (a website domain is not a practice name). Before asserting an identifier or entity match, require a positive label match or run a negative control: a different entity must yield a different value, and a value shared by every row is not an identifier. When the task specifies the exact representation for missing data (for example an empty string ""), use exactly that representation and put any explanation in a separate notes field, never in the value itself. -Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief: verify the domain has the right category, currency, locale, or authority, then commit. If the task permits unavailable/missing rows, a domain is viable as soon as it has the requested category/source type and at least one requested row or a searchable catalog in the requested currency/locale; do not keep searching for a perfect domain that has every row. Do not stitch rows from multiple domains, and do not keep vendor-hopping after a viable domain exists. Switch domains only when the current domain clearly cannot satisfy the requested category/currency/authority after a bounded check. If an item is missing on the committed domain, mark it unavailable for that domain and move to the next checklist row. +Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist of every required row and field. Required rows and fields must be completed from the correct source type unless absence is proven from that source. Do not mark required values unknown just because one query, endpoint, page, or selector failed. Spend work across the checklist, but before `done`, audit row count, required fields, null rates, dedupe keys, and source coverage — and reconcile the collection against at least one EXTERNAL reality signal: the site's own stated total, the rendered page's count, or the presence of the largest expected entity (a list of Illinois stores with zero Chicago entries is wrong even if internally consistent). An audit that only checks your own collection against your own link list proves nothing. Missing required values must either be fixed or reported as an explicitly incomplete fallback. -Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `http_get_many` for independent public URLs or `browser_fetch_many` when browser cookies/session state are needed. Use single `http_get`/`browser_fetch` calls for one-off checks. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. +Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief, but completion on the chosen domain is not optional. Do not stitch rows from multiple domains, do not silently source-switch, and do not mark missing rows unavailable unless the correct source path was checked and the absence is supported by evidence. Switch domains only when the current domain clearly cannot satisfy the requested category, currency, locale, or authority. + +Bounded audit rule: verification should catch real mistakes without spinning forever. After writing a substantial artifact, run a small independent audit of count/schema/required-fields/source coverage, then repair the specific gaps it finds. Keep repairing until the required rows and fields are satisfied or the run-level task timebox is nearly spent — do not stop after a single repair pass while required data is still missing and reachable from its correct source. What you must avoid is blindly restarting the same full crawl, pagination sweep, or detail scrape just because counts fluctuate or a site intermittently returns empty pages; target the specific missing items instead of starting over. Finalize as explicitly incomplete only when the source genuinely does not expose the required data, or the run is nearly out of turns. + +Use the browser to discover and verify. Once the browser reveals a stable data endpoint, static link, downloadable asset, XHR/fetch pattern, or predictable pagination URL, use single targeted `http_get` or `browser_fetch` calls when they help verify or complete the source. Do not replace source completion with blind bulk fetching. If a script, `http_get`, `browser_fetch`, endpoint, or selector errors, returns empty, or is blocked, fall back to navigating and reading the rendered page before marking anything unavailable — do not re-run the same failing script or abandon the source for a degraded one. For long extraction loops, split work into bounded chunks sized well under the script timeout (a loop with mandated waits or more than ~20 navigations can never fit one script), checkpoint partial results to a file after EVERY item, and resume from the checkpoint instead of restarting; never retry a timed-out monolithic script in the same shape. When fetching 3 or more independent URLs, fetch them concurrently with per-request timeouts (concurrent.futures) so one hanging site cannot stall the batch — a serial loop over independent URLs is always wrong. Each chunk must emit compact progress with source URL/pattern, count, missing-field count, failure count, and next cursor/page when relevant. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the task demands inline structured content, return it with `done(result=...)`; otherwise, when the full content is already saved and verified in a result file, finish with `done(result_file=path)` plus a SHORT inline summary (counts, sample row) — never re-type a large saved artifact into `result`. Before `done`, block claimed-complete results with empty arrays, high null rates in required fields, missing required fields, wrong known counts, source drift, blank screenshots, or partial/incomplete markers. Use helper agents only when the user explicitly asks for sub-agents, delegation, or parallel agent work. Requests for depth, thoroughness, research, investigation, or detailed codebase analysis do not by themselves authorize spawning a helper. When delegation is authorized, give each helper a narrow, self-contained task that materially advances the work, keep urgent blocking work local, avoid duplicate helper work, and continue useful non-overlapping local work while the helper runs. Use the `explorer` role for authorized read-only repository questions and `worker` for authorized implementation work with a bounded write scope. diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index 646b9df6..2bf5ee0e 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -10,7 +10,8 @@ Important execution model: - Python variables do not persist across calls. - Browser/CDP state persists in Rust. - Fast calls return their final result immediately. Long calls return `status: running` with a `run_id`; keep observing that same run until it finishes, fails, or is cancelled. -- To listen to a running script, call this tool with `action="observe"`, the returned `run_id`, and optionally `observe_timeout_ms`. Prefer coarse waits such as 30000-120000 ms for long navigation or extraction scripts; do not burn many turns polling the same `run_id` with short waits. +- To listen to a running script, call this tool with `action="observe"`, the returned `run_id`, and optionally `observe_timeout_ms`. Prefer coarse waits such as 30000-120000 ms for long navigation or extraction scripts; do not burn many turns polling the same `run_id` with short waits. One generous observe beats several short polls. If a script checkpoints progress to a file, read the file from `shell` instead of observing again. If an observe reports the run completed or `not_found`, STOP: do not observe or cancel that run_id again — the result has already been delivered. +- `js()` returns the JSON-serialized value of your expression: wrap complex returns in `JSON.stringify(...)` inside the JS and parse with `json.loads(...)`; print the raw return once before indexing into it. Never embed Python-built regex literals directly in JS strings (escaping differs) — build the pattern inside the JS or use string methods. - To stop a running script, call this tool with `action="cancel"` and the `run_id`. Partial images and artifacts emitted before cancellation are preserved. - A failed `browser_script` call may include a short diagnosis. Read that diagnosis first: if it says the browser is still connected or the same page is usable, continue from the same page instead of reconnecting. - Helpers are preimported; you do not need imports for normal browser work. @@ -56,9 +57,7 @@ email_address() email_inbox(limit=20, sent_after=None) email_message(message_id) http_get(url, **kwargs) -http_get_many(urls, **kwargs) browser_fetch(url, **kwargs) -browser_fetch_many(requests, **kwargs) copy_artifact(path, kind="file") emit_output(value, label=None) @@ -82,7 +81,7 @@ Usage guidance: - Do not combine `Input.dispatchKeyEvent` carrying printable `text` with a manual `char` event for the same character; that double-inserts text in Chrome. - If the task is site-specific, call `domain_skills_for_url(url, include_content=True)` before inventing selectors, private API routes, or flows. `goto_url(url)` also returns matching `domain_skills` metadata when a skill root is available. - Be patient with loading pages by making several cheap observations, not one long blind wait. Prefer short waits such as `wait_for_load(1)`, `wait_for_element(selector, timeout=2)`, or `wait_for_network_idle(2)`, then inspect again. If a wait returns false, that is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. -- Use screenshots as labeled temporal checkpoints when visual state matters: before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and visual final verification. For text-heavy research, document reading, search, pricing, tables, and list extraction, prefer `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. +- Use screenshots as labeled temporal checkpoints: initial load, before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and final verification. For screenshot or visual-output tasks, verify the saved image is contentful and nonblank before `done`. - The common screenshot call is `screenshot(label)`, for example `screenshot("before_submit")`. - Screenshot/image artifacts are sent as `input_image` content to the next model turn. The user does not see those pixels inline in the terminal; describe what you see or provide the saved artifact path when the user asks for the screenshot. - If a script emits screenshots/images and then fails, the next model turn still receives the images alongside the failure diagnosis. Use those pixels to decide the next smaller retry. @@ -118,45 +117,16 @@ emit_output(rows, label="employee_rows") - Use `js(...)` for DOM inspection and raw `cdp(...)` for lower-level browser actions. - Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. - For real user forms, act like a browser user: screenshot, click the visible field/control, type with `type_text(...)`, `press_key(...)`, or `fill_input(...)`, then screenshot or otherwise verify. Use coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Do not assign `element.value`, `element.checked`, `selectedIndex`, React private state, or MutationObserver restore loops on live forms. Do not synthesize `input`, `change`, `click`, or keyboard events in page JavaScript to make a form look filled. Those anti-patterns can desynchronize framework state from the visible DOM. -- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint, and `http_get_many(...)` for several independent public URLs. Use `browser_fetch(...)` or `browser_fetch_many(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. `browser_fetch(...)` and the batch helpers return error records by default so one bad endpoint does not waste the whole extraction chunk; pass `return_error=False` or `return_errors=False` only when a hard failure is intended. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. -- Batch recipe after discovering stable links or endpoints: - -```python -# browser_summary: -# { -# "fetch_progress": { -# "kind": "extracted", -# "message": "Fetched ${$.ok_count}/${$.total} independent URLs" -# }, -# "records": { -# "kind": "extracted", -# "message": "Extracted ${$.length} records from fetched pages" -# } -# } - -urls = [...] -responses = http_get_many(urls, timeout=12, max_workers=8) -ok = [r for r in responses if not isinstance(r, dict) and getattr(r, "status_code", 0) < 400] -emit_output({"total": len(responses), "ok_count": len(ok)}, label="fetch_progress") - -records = [] -for url, response in zip(urls, responses): - if isinstance(response, dict) and response.get("error"): - records.append({"url": url, "status": "error", "error": response["error"]}) - continue - text = response.text - records.append({"url": url, "status": response.status_code, "title": text[:200]}) - -emit_output(records, label="records") -``` +- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint. Use `browser_fetch(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. Do not replace source completion with blind bulk fetching; use small inspected chunks with progress, counts, missing fields, and source coverage. - Extract only fields needed for the task. Do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless you are debugging and the smaller field-level extraction failed. - Save complete generated result files under `outputs_dir()` or relative paths in the current working directory. Files written there are collected as artifacts automatically; `copy_artifact(...)` is for files created elsewhere. - For large structured results, write the full JSON/CSV/text to a file. If the task asks for an exact inline final format, return that content with `done(result=...)` and optionally include `result_file=path`; otherwise finish with `done(result_file=path)`. - For loops over multiple pages/items, emit short progress every item or every 2 seconds, whichever comes first. Progress can be a short `print(...)` line or compact `emit_output(..., label="progress")`. +- For audits after a large result, run a small independent sample/count/schema check, then repair the specific gaps it finds until the required rows/fields are complete or the run is nearly out of turns. Do not rerun the whole crawl or full detail scrape just because counts fluctuate or some pages are intermittently empty; target the missing items, and mark a gap as a genuine absence only after checking its correct source path. - For list/profile extraction, filter the candidate list before navigating when the list page already contains enough information, such as employee versus contractor. Do not visit rows that cannot affect the final answer. -- Poll for record readiness, not for nullable answer fields. If the app cache or DOM record for a person exists but `birthday`, phone, address, or another optional field is missing/null, record that value as missing and continue instead of waiting for the optional field to appear. -- For long extraction or verification loops, prefer bounded chunks with checkpoints written to files. Use one global deadline plus per-item micro timeouts; check the global deadline before every navigation, wait, and sleep. If a chunk fails with a usable-page diagnosis, shrink the next chunk and resume from the last checkpoint. +- Poll until the record itself is ready before extracting fields. If a loaded record is missing a required field, inspect the correct source path before marking it absent; do not record required values as missing just because the first record view is null. +- For long extraction or verification loops, prefer bounded chunks with checkpoints written to files. Use per-item micro timeouts and inspect progress after each chunk. If a chunk fails with a usable-page diagnosis, shrink the next chunk and resume from the last checkpoint. Signing in / sign-ups: before signing up with a new email, check whether you're already logged in (you often drive the user's own profile) or have a saved credential for the site (listed under "Saved credentials") — if so, use it. If there's no existing login, ask the user whether to sign in with their own account (they save it via `/secrets`) or have you create a disposable account (you generate a throwaway inbox with `email_address()` and read its verification emails yourself), and wait for their choice. For the disposable path, call `email_address()`, record whatever context you need before submitting (`current_datetime()["utc"]`, existing `message_id`s from `email_inbox()`, or both), fill the email field, submit, then inspect/poll `email_inbox(sent_after=...)` or compare `timestamp`/`message_id` yourself (newest-first; `preview` already holds the code; `email_message(message_id)` has the full `text`/`html` for magic links). diff --git a/prompts/dataset-case-user.md b/prompts/dataset-case-user.md index d668d259..1efc0d61 100644 --- a/prompts/dataset-case-user.md +++ b/prompts/dataset-case-user.md @@ -6,7 +6,7 @@ Task ID: {{task_id}} Task: {{task}} -Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. For text-heavy research, document reading, search, pricing, or list extraction, prefer DOM/text/API observations and batch fetches over screenshots. Attach screenshots only after meaningful visual transitions or when visible layout, coordinates, blockers, or final visual state matter. +Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. Use screenshots as verification checkpoints after meaningful visual transitions and whenever visible state, layout, blockers, coordinates, dynamic content, or final browser state matters. Filesystem contract: if the task asks you to save files, write them in the current working directory using relative paths. For large JSON/CSV/list results, save the full result to `result.json` or `result.csv` so it is available as an artifact. If the requested final answer is not an exact inline format, return a compact final answer with the output path, record count, schema/columns, and one sample row instead of pasting a giant blob. @@ -16,9 +16,11 @@ File result contract: after writing a complete JSON/CSV/text result file, use `d Remote browser contract: browser automation may run on a different machine from the local filesystem. Files downloaded by the remote browser are not automatically available in the local working directory. If a task needs a downloaded file locally, transfer or fetch it into the current working directory, then verify the local path exists before referencing, opening, or finalizing it. For uploads, make sure the file you intend to upload is available to the browser context you are controlling. -Long extraction contract: if the task needs many pages, rows, files, or detail records, work in bounded chunks. Discover the endpoint or pagination pattern first, then fetch in batches with explicit timeouts, checkpoint partial results in the current working directory, and print compact progress counts. A timed-out all-in-one crawl with no saved artifact is not progress; resume from checkpoints when a chunk fails. +Long extraction contract: if the task needs many pages, rows, files, or detail records, work in bounded chunks. Discover the source and pagination pattern first, inspect progress from the browser/source itself, checkpoint partial results in the current working directory, and print compact progress counts. A timed-out all-in-one crawl with no saved artifact is not progress; resume from checkpoints when a chunk fails. -Timebox contract: dataset runs have a short wall-clock budget. For long research, document, or extraction tasks, set a soft deadline before starting broad collection, about 7 minutes from now, and a hard deadline about 8.5 minutes from now. Check the deadline before each new page, document, query, or file. After the soft deadline, stop broad research and fill remaining fields from the strongest verified evidence or mark them unknown/unavailable. Before the hard deadline, call `done(...)` with the completed or partial result. Never keep running until the external runner timeout with no saved result. +Bounded verification contract: after a large result is saved, verify it with a compact independent check of count/schema/required-fields/source coverage, then repair the specific gaps it finds. Keep repairing until the required rows and fields are satisfied or the step budget is nearly spent; target the specific missing items rather than blindly restarting full crawls or pagination sweeps just because counts fluctuate. Finalize from the best saved artifact only when the source genuinely cannot supply the missing data. + +Completion discipline: complete the requested task before calling `done(...)`. Do not call `done` with a partial result just because the source is awkward, slow, or a first extraction path failed. Only when the remaining step budget is nearly exhausted and a full answer is no longer possible, save the best verified artifact, explicitly mark the result incomplete, list the missing requirements, and then call `done(...)`. Completion contract: the final answer must contain the requested answer or a clear pointer to the artifact that contains it. For artifact-heavy results, include the artifact path, record count, schema/columns, and one sample row. A bare acknowledgement such as `Done.` is not useful unless the task explicitly asked for no visible answer. @@ -28,6 +30,6 @@ Verification contract: when the task has explicit checkable requirements for rec If the task gives fallback instructions, treat them as part of the task. Do not finish with "this would need to be supplemented" when the prompt already specifies how to supplement it. -When the turn budget is nearly exhausted, stop starting new lines of investigation. Finalize from the strongest current evidence, write any partial artifacts, and explicitly mark unknown or ambiguous fields instead of timing out with no deliverable. +When the turn budget is nearly exhausted, stop starting new lines of investigation. Finalize from the strongest current evidence only as an explicitly incomplete fallback, write any partial artifacts, and name unknown or ambiguous fields instead of timing out with no deliverable. Return the final answer with the done tool only when the task is complete. diff --git a/prompts/python-tool-description.md b/prompts/python-tool-description.md index 84314cd1..14991a3f 100644 --- a/prompts/python-tool-description.md +++ b/prompts/python-tool-description.md @@ -12,7 +12,7 @@ Do not import Playwright, Selenium, or Pyppeteer. Browser-harness workflow: firs To pass pixels directly to the next model turn, call raw `cdp("Page.captureScreenshot", format="png")`, `screenshot(label)`, `screenshot_clip(label, x, y, width, height)`, or `capture_screenshot(..., attach=True)`; use `emit_image(path)` for existing image files. Raw `Page.captureScreenshot` results are attached automatically. The user does not see attached pixels inline in the terminal; describe what you see or provide the saved artifact path when the user asks for a screenshot. Multiple labeled screenshots are good when they form a temporal trace, not when they repeat the same unchanged page. -For final structured answers, write the complete JSON/CSV/text to a file under `outputs_dir()` or a relative path in the current working directory, then verify the count/schema. If the task asks for exact inline JSON, CSV, a table, markdown, or a schema-shaped response, return that content with `done(result=...)`; you may also include `result_file=path` to attach the saved file. Use `done(result_file=path)` by itself only when a file pointer or artifact summary satisfies the task. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial data to files, emit progress every item or every 2 seconds, use one global deadline, filter candidates before navigation when possible, and poll for record readiness rather than nullable answer fields. Extract only task-relevant fields; avoid full profile text, full DOM text, cookies, localStorage, or whole app caches unless debugging requires them. Use the browser to discover and verify; after stable endpoints, file URLs, XHR/fetch patterns, or pagination URLs are known, use `requests`, `http_get`, or threaded HTTP for bulk extraction. +For final structured answers, write the complete JSON/CSV/text to a file under `outputs_dir()` or a relative path in the current working directory, then verify the count/schema. If the task asks for exact inline JSON, CSV, a table, markdown, or a schema-shaped response, return that content with `done(result=...)`; you may also include `result_file=path` to attach the saved file. Use `done(result_file=path)` by itself only when a file pointer or artifact summary satisfies the task. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial data to files, and emit progress every item or every 2 seconds with source URL/pattern, count, missing-field count, failure count, and next cursor/page when relevant. After saving a large artifact, verify with a compact independent count/schema/required-fields/source check, then repair the specific gaps it finds until the required data is complete or the run is nearly out of turns; target the missing items rather than restarting the full crawl or detail scrape when the source is inconsistent or intermittently empty. Extract only task-relevant fields; avoid full profile text, full DOM text, cookies, localStorage, or whole app caches unless debugging requires them. Use the browser to discover and verify stable endpoints, file URLs, XHR/fetch patterns, or pagination URLs, then use targeted `requests` or `http_get` calls to complete or verify the source. Do not replace source completion with blind bulk fetching. Use `audit_artifact(...)` or ordinary Python checks when the task has explicit checkable requirements for the result or files. If checks fail, fix the result and rerun them when possible; otherwise finalize as partial/incomplete and name the remaining gaps. diff --git a/results.txt b/results.txt new file mode 100644 index 00000000..77100163 --- /dev/null +++ b/results.txt @@ -0,0 +1 @@ +[{"grantee_code":"2ACAH","result_count":60},{"grantee_code":"BCE","result_count":448},{"grantee_code":"2A3UL","result_count":69},{"grantee_code":"X26","result_count":97},{"grantee_code":"U28","result_count":69},{"grantee_code":"2AXDT","result_count":53}] \ No newline at end of file