From 3bc399f94cd69b9b588a9b25354ce8179f28dc12 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Fri, 1 May 2026 11:27:09 +0100 Subject: [PATCH 01/13] feat(agent): Initial click model integration --- .../apps/agent/lib/tool-labels.ts | 6 +- .../apps/server/src/agent/ai-sdk-agent.ts | 65 +++- .../apps/server/src/agent/gui-click-only.ts | 17 + .../apps/server/src/agent/prompt.ts | 94 +++++- .../apps/server/src/browser/browser.ts | 45 ++- .../apps/server/src/tools/acl/acl-guard.ts | 4 + .../apps/server/src/tools/framework.ts | 5 +- .../server/src/tools/gui-click-resolver.ts | 89 ++++++ .../apps/server/src/tools/input.ts | 153 +++++++-- .../server/src/tools/molmo-point-client.ts | 170 ++++++++++ .../server/src/tools/molmo-point-config.ts | 5 + .../apps/server/src/tools/registry.ts | 4 +- .../apps/server/src/tools/response.ts | 17 +- .../server/src/tools/tool-label-registry.ts | 6 +- .../server/tests/agent/gui-click-only.test.ts | 30 ++ .../apps/server/tests/agent/prompt.test.ts | 27 ++ .../apps/server/tests/tools/input.test.ts | 292 +++++++++++++++--- .../apps/server/tests/tools/response.test.ts | 22 ++ 18 files changed, 950 insertions(+), 101 deletions(-) create mode 100644 packages/browseros-agent/apps/server/src/agent/gui-click-only.ts create mode 100644 packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts create mode 100644 packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts create mode 100644 packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts create mode 100644 packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts diff --git a/packages/browseros-agent/apps/agent/lib/tool-labels.ts b/packages/browseros-agent/apps/agent/lib/tool-labels.ts index 5657605e4..ad659ac8e 100644 --- a/packages/browseros-agent/apps/agent/lib/tool-labels.ts +++ b/packages/browseros-agent/apps/agent/lib/tool-labels.ts @@ -36,6 +36,7 @@ const VERB_OVERRIDES: Record = { hover: 'Hovered', hover_at: 'Hovered at coordinates', 
type_at: 'Typed at coordinates', + type_text: 'Typed text', drag_at: 'Dragged', focus: 'Focused element', fill: 'Filled field', @@ -186,8 +187,8 @@ const SUBJECT_EXTRACTORS: Record = { find_files: (i) => quote(stringField(i, 'pattern', 'query')), // Element interactions - click: (i) => stringField(i, 'element'), - hover: (i) => stringField(i, 'element'), + click: (i) => stringField(i, 'prompt'), + hover: (i) => stringField(i, 'prompt', 'element'), focus: (i) => stringField(i, 'element'), clear: (i) => stringField(i, 'element'), check: (i) => stringField(i, 'element'), @@ -199,6 +200,7 @@ const SUBJECT_EXTRACTORS: Record = { return target ?? truncate(text, 40) }, press_key: (i) => stringField(i, 'key'), + type_text: (i) => truncate(stringField(i, 'text'), 40), // Coordinate-based input click_at: (i) => coords(i.x, i.y), diff --git a/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts b/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts index 737c7bf6e..5ddce23f0 100644 --- a/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts +++ b/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts @@ -32,6 +32,11 @@ import { buildMemoryToolSet } from '../tools/memory/build-toolset' import type { ToolRegistry } from '../tools/tool-registry' import { CHAT_MODE_ALLOWED_TOOLS } from './chat-mode' import { createCompactionPrepareStep, type StepWithUsage } from './compaction' +import { + GUI_CLICK_ONLY_BROWSER_TOOL_NAMES, + GUI_CLICK_ONLY_MODE, + isGuiClickOnlyBrowserToolAllowed, +} from './gui-click-only' import { buildMcpServerSpecs, createMcpClients } from './mcp-builder' import { getMessageNormalizationOptions, @@ -101,6 +106,7 @@ export class AiSdkAgent { session: { origin: config.resolvedConfig.origin, originPageId, + suppressSnapshotOutputs: GUI_CLICK_ONLY_MODE, }, aclRules: config.aclRules, } @@ -109,32 +115,48 @@ export class AiSdkAgent { toolContext, config.resolvedConfig.toolApprovalConfig, ) - const browserTools = 
config.resolvedConfig.chatMode - ? Object.fromEntries( - Object.entries(allBrowserTools).filter(([name]) => - CHAT_MODE_ALLOWED_TOOLS.has(name), - ), - ) - : allBrowserTools + let browserTools = allBrowserTools + if (GUI_CLICK_ONLY_MODE) { + browserTools = Object.fromEntries( + Object.entries(allBrowserTools).filter(([name]) => + isGuiClickOnlyBrowserToolAllowed(name), + ), + ) + } else if (config.resolvedConfig.chatMode) { + browserTools = Object.fromEntries( + Object.entries(allBrowserTools).filter(([name]) => + CHAT_MODE_ALLOWED_TOOLS.has(name), + ), + ) + } if (config.resolvedConfig.chatMode) { logger.info('Chat mode enabled, restricting to read-only browser tools', { allowedTools: Array.from(CHAT_MODE_ALLOWED_TOOLS), }) } + if (GUI_CLICK_ONLY_MODE) { + logger.info('GUI click-only mode enabled, restricting browser tools', { + allowedTools: Array.from(GUI_CLICK_ONLY_BROWSER_TOOL_NAMES), + }) + } // Get Klavis tools from shared background handle (no per-session connection). // Only expose when user has enabled servers — matches old per-session gating. const klavisTools = + !GUI_CLICK_ONLY_MODE && config.klavisRef?.handle && config.browserContext?.enabledMcpServers?.length ? buildKlavisToolSet(config.klavisRef.handle) : {} // Connect custom (non-Klavis) MCP servers per-session - const specs = await buildMcpServerSpecs({ - browserContext: config.browserContext, - }) - const { clients, tools: customMcpTools } = await createMcpClients(specs) + const { clients, tools: customMcpTools } = GUI_CLICK_ONLY_MODE + ? 
{ clients: [] as Array<{ close(): Promise }>, tools: {} } + : await createMcpClients( + await buildMcpServerSpecs({ + browserContext: config.browserContext, + }), + ) const collidingToolNames = Object.keys(customMcpTools).filter( (name) => name in klavisTools, ) @@ -183,12 +205,15 @@ export class AiSdkAgent { // Add filesystem tools — skip in chat mode (read-only) and when no workspace is selected const filesystemTools = - !config.resolvedConfig.chatMode && config.resolvedConfig.workingDir + !GUI_CLICK_ONLY_MODE && + !config.resolvedConfig.chatMode && + config.resolvedConfig.workingDir ? buildFilesystemToolSet(config.resolvedConfig.workingDir) : {} - const memoryTools = config.resolvedConfig.chatMode - ? {} - : buildMemoryToolSet() + const memoryTools = + config.resolvedConfig.chatMode || GUI_CLICK_ONLY_MODE + ? {} + : buildMemoryToolSet() const tools = { ...browserTools, ...externalMcpTools, @@ -212,6 +237,15 @@ export class AiSdkAgent { ) { excludeSections.push('nudges') } + if (GUI_CLICK_ONLY_MODE) { + excludeSections.push( + 'external-integrations', + 'memory-and-identity', + 'workspace', + 'skills', + 'nudges', + ) + } const soulContent = await readSoul() const isBootstrap = await isSoulBootstrap() @@ -233,6 +267,7 @@ export class AiSdkAgent { declinedApps: config.resolvedConfig.declinedApps, skillsCatalog, origin: config.resolvedConfig.origin, + guiClickOnly: GUI_CLICK_ONLY_MODE, }) // Configure compaction for context window management diff --git a/packages/browseros-agent/apps/server/src/agent/gui-click-only.ts b/packages/browseros-agent/apps/server/src/agent/gui-click-only.ts new file mode 100644 index 000000000..db420356f --- /dev/null +++ b/packages/browseros-agent/apps/server/src/agent/gui-click-only.ts @@ -0,0 +1,17 @@ +export const GUI_CLICK_ONLY_MODE = true + +export const GUI_CLICK_ONLY_BROWSER_TOOL_NAMES = new Set([ + 'click', + 'hover', + 'scroll', + 'type_text', + 'get_active_page', + 'list_pages', + 'navigate_page', + 'new_page', + 'close_page', 
+]) + +export function isGuiClickOnlyBrowserToolAllowed(name: string): boolean { + return GUI_CLICK_ONLY_BROWSER_TOOL_NAMES.has(name) +} diff --git a/packages/browseros-agent/apps/server/src/agent/prompt.ts b/packages/browseros-agent/apps/server/src/agent/prompt.ts index 52be6842c..a3f579ebb 100644 --- a/packages/browseros-agent/apps/server/src/agent/prompt.ts +++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts @@ -31,6 +31,14 @@ function getRoleAndMode( _exclude: Set, options?: BuildSystemPromptOptions, ): string { + if (options?.guiClickOnly) { + return ` +You are BrowserOS running in an experimental GUI click model mode. Page clicks are mediated through the \`click\` tool, which uses a visual model to choose coordinates from the current screenshot. + +Your tool surface is intentionally small: open or manage pages, then interact with visible page targets through GUI-backed clicks. You cannot read page content or inspect elements through DOM, accessibility-tree, snapshot, page-text, screenshot, link-extraction, or script-evaluation tools in this mode. +` + } + const hasWorkspace = !!options?.workspaceDir let role: string @@ -124,6 +132,22 @@ function getCapabilities( ): string { const hasWorkspace = !!options?.workspaceDir + if (options?.guiClickOnly) { + return ` +## Your Capabilities + +### Browser Control +Use these browser tools under the GUI click model constraint: +- \`click\` captures the current page screenshot internally, asks the GUI click model where to click based on your prompt, then executes that coordinate click. +- \`hover\` captures the current page screenshot internally, asks the GUI model where to hover based on your prompt, then moves the cursor there. +- \`type_text\` types into the currently focused element. Use it after \`click\` focuses a text field. +- \`scroll\` scrolls the page viewport. +- \`get_active_page\`, \`list_pages\`, \`navigate_page\`, \`new_page\`, and \`close_page\` are available for opening and managing pages. 
+ +You cannot inspect the DOM, accessibility tree, snapshots, page text, screenshots, links, or scripts. Use the Page ID from Browser Context directly and issue concise visual click prompts for page targets. +` + } + let capabilities = ` ## Your Capabilities @@ -195,6 +219,20 @@ function getExecution( _exclude: Set, options?: BuildSystemPromptOptions, ): string { + if (options?.guiClickOnly) { + return ` +## Execution + +- Use \`click\` for visible page targets. It is the only click path that should choose page coordinates. +- Use \`hover\` for visible hover targets, \`type_text\` after focusing a field, and \`scroll\` to move the viewport. +- Use \`new_page\` or \`navigate_page\` to open websites. Use \`get_active_page\`, \`list_pages\`, and \`close_page\` only when needed for page management. +- Use the Page ID from Browser Context directly. +- Do not try to observe the page with snapshots, DOM, accessibility trees, screenshots, scripts, link extraction, or text extraction. +- You are blind to page content between clicks. Make one concise visual click prompt at a time, then continue from your best estimate of the resulting page state. +- If the task clearly cannot proceed without page observation, say what blocked you. +` + } + const isNewTab = options?.origin === 'newtab' let executionContent = ` @@ -283,6 +321,20 @@ function getToolSelection( _exclude: Set, options?: BuildSystemPromptOptions, ): string { + if (options?.guiClickOnly) { + return ` +## Tool Selection + +- Use \`click\` for visible page targets. +- Use \`hover\` for hover menus or targets that reveal content. +- Use \`type_text\` only after a prior GUI click likely focused a text input. Include a newline in \`text\` when you need to submit with Enter. +- Use \`scroll\` to move the page viewport when the target is likely below or above the visible area. 
+- The \`prompt\` argument should describe the visible target to click, for example: "click the search box", "click the Add to Cart button", or "click the first product result". +- Use page-opening and page-management tools only to get to the website or manage tabs; they do not replace visual page clicking. +- Do not request or rely on element IDs, snapshots, DOM nodes, page text, screenshots, scripts, link extraction, or coordinate click tools. +` + } + const isNewTab = options?.origin === 'newtab' const navTable = isNewTab @@ -413,6 +465,19 @@ function getErrorRecovery( ): string { const hasWorkspace = !!options?.workspaceDir + if (options?.guiClickOnly) { + return ` +## Error Recovery + +### Browser interaction errors +- If a click does not appear to make progress, try one more click with a more specific visual prompt. +- After 2 failed attempts, describe the blocker and ask the user for guidance. + +### Page errors +- If you infer that a site is blocked by login, CAPTCHA, 2FA, geo-blocking, or payment confirmation, pause and ask the user to handle it. +` + } + let recovery = ` ## Error Recovery @@ -601,6 +666,21 @@ function getStyle( _exclude: Set, options?: BuildSystemPromptOptions, ): string { + if (options?.guiClickOnly) { + return ` +## Style + + +- Keep click prompts concise and visual. +- Do not narrate routine clicks before calling the tool. +- After a click, continue from your best estimate of the page state. + + +- Be concise. +- Report blockers plainly when GUI clicks and page opening are insufficient. +` + } + const hasWorkspace = !!options?.workspaceDir let style = ` @@ -664,8 +744,9 @@ function getUserContext( '\nYou are running as a **scheduled background task** on a system-managed hidden page.' } - pageCtx += - '\n\n**CRITICAL RULES:**\n1. **Do NOT call `get_active_page` or `list_pages` to find your starting page.** Use the **page ID from the Browser Context** directly.' + pageCtx += options?.guiClickOnly + ? 
'\n\n**CRITICAL RULE:** Use the **page ID from the Browser Context** directly when calling `click`.' + : '\n\n**CRITICAL RULES:**\n1. **Do NOT call `get_active_page` or `list_pages` to find your starting page.** Use the **page ID from the Browser Context** directly.' if (options?.isScheduledTask) { const pageRef = options.scheduledTaskPageId @@ -751,10 +832,19 @@ export interface BuildSystemPromptOptions { skillsCatalog?: string /** Where the chat session originates from — determines navigation behavior. */ origin?: 'sidepanel' | 'newtab' + /** Experimental mode: browser control is limited to GUI-backed click only. */ + guiClickOnly?: boolean } export function buildSystemPrompt(options?: BuildSystemPromptOptions): string { const exclude = new Set(options?.exclude) + if (options?.guiClickOnly) { + exclude.add('external-integrations') + exclude.add('memory-and-identity') + exclude.add('workspace') + exclude.add('skills') + exclude.add('nudges') + } const sections = Object.entries(promptSections) .filter(([key]) => !exclude.has(key)) diff --git a/packages/browseros-agent/apps/server/src/browser/browser.ts b/packages/browseros-agent/apps/server/src/browser/browser.ts index e4a45cd83..7657a5ca8 100644 --- a/packages/browseros-agent/apps/server/src/browser/browser.ts +++ b/packages/browseros-agent/apps/server/src/browser/browser.ts @@ -364,6 +364,31 @@ export class Browser { } } + async resolveFocusedElement(pageId: number): Promise { + const session = await this.resolveSession(pageId) + try { + const result = await session.Runtime.evaluate({ + expression: `(() => { + let element = document.activeElement; + while (element?.shadowRoot?.activeElement) { + element = element.shadowRoot.activeElement; + } + return element instanceof Element ? 
element : null; + })()`, + }) + const objectId = result.result?.objectId + if (!objectId) return null + + const desc = await session.DOM.describeNode({ objectId }) + const backendNodeId = desc.node?.backendNodeId + if (!backendNodeId) return null + + return await this.resolveActionableElement(pageId, backendNodeId) + } catch { + return null + } + } + async resolveElementProperties( pageId: number, backendNodeId: number, @@ -911,6 +936,16 @@ export class Browser { } } + async viewportSize(page: number): Promise<{ width: number; height: number }> { + const session = await this.resolveSession(page) + const metrics = await session.Page.getLayoutMetrics() + const viewport = metrics.cssVisualViewport ?? metrics.cssLayoutViewport + return { + width: viewport.clientWidth, + height: viewport.clientHeight, + } + } + async evaluate( page: number, expression: string, @@ -1099,6 +1134,11 @@ export class Browser { await keyboard.typeText(session, text) } + async typeText(page: number, text: string): Promise { + const session = await this.resolveSession(page) + await keyboard.typeText(session, text) + } + async dragAt( page: number, from: { x: number; y: number }, @@ -1220,8 +1260,9 @@ export class Browser { y = center.y } else { const metrics = await session.Page.getLayoutMetrics() - x = metrics.layoutViewport.clientWidth / 2 - y = metrics.layoutViewport.clientHeight / 2 + const viewport = metrics.cssVisualViewport ?? 
metrics.cssLayoutViewport + x = viewport.clientWidth / 2 + y = viewport.clientHeight / 2 } const beforeWindowPosition = diff --git a/packages/browseros-agent/apps/server/src/tools/acl/acl-guard.ts b/packages/browseros-agent/apps/server/src/tools/acl/acl-guard.ts index 637362cca..f81e0fbac 100644 --- a/packages/browseros-agent/apps/server/src/tools/acl/acl-guard.ts +++ b/packages/browseros-agent/apps/server/src/tools/acl/acl-guard.ts @@ -19,6 +19,7 @@ const GUARDED_TOOLS = new Set([ 'uncheck', 'select_option', 'press_key', + 'type_text', 'upload_file', ]) @@ -39,6 +40,9 @@ async function resolveTargetElementId( if (toolName === 'drag' && typeof args.sourceElement === 'number') { return args.sourceElement } + if (toolName === 'type_text') { + return (await browser.resolveFocusedElement(pageId)) ?? undefined + } if (typeof args.x === 'number' && typeof args.y === 'number') { return ( diff --git a/packages/browseros-agent/apps/server/src/tools/framework.ts b/packages/browseros-agent/apps/server/src/tools/framework.ts index b5f421918..b2608e6f5 100644 --- a/packages/browseros-agent/apps/server/src/tools/framework.ts +++ b/packages/browseros-agent/apps/server/src/tools/framework.ts @@ -29,6 +29,7 @@ export interface ToolDirectories { export interface ToolSessionContext { origin?: 'sidepanel' | 'newtab' originPageId?: number + suppressSnapshotOutputs?: boolean } export type ToolContext = { @@ -135,7 +136,9 @@ export async function executeTool( response.error(`Internal error in ${tool.name}: ${message}`) } - const result = await response.build(ctx.browser) + const result = await response.build(ctx.browser, { + suppressSnapshots: ctx.session?.suppressSnapshotOutputs, + }) const pageId = (args as Record).page if (typeof pageId === 'number') { diff --git a/packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts b/packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts new file mode 100644 index 000000000..53f22c59c --- /dev/null +++ 
b/packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts @@ -0,0 +1,89 @@ +import type { ToolContext } from './framework' +import { + getPngDimensionsFromBase64, + requestMolmoPoint, +} from './molmo-point-client' + +const LOG_TEXT_MAX_CHARS = 500 + +function truncateForLog(value: string | undefined): string | undefined { + if (!value) return value + if (value.length <= LOG_TEXT_MAX_CHARS) return value + return `${value.slice(0, LOG_TEXT_MAX_CHARS)}... (+${value.length - LOG_TEXT_MAX_CHARS} chars)` +} + +export interface GuiPointResult { + x: number + y: number + log: Record +} + +export async function resolveGuiPoint( + ctx: ToolContext, + page: number, + prompt: string, +): Promise { + const screenshot = await ctx.browser.screenshot(page, { + format: 'png', + fullPage: false, + }) + const point = await requestMolmoPoint({ + instruction: prompt, + imageB64: screenshot.data, + }) + + const dimensions = getPngDimensionsFromBase64(screenshot.data) + const viewport = await ctx.browser.viewportSize(page).catch(() => null) + const scaleX = + dimensions && viewport?.width + ? dimensions.width / viewport.width + : screenshot.devicePixelRatio + const scaleY = + dimensions && viewport?.height + ? dimensions.height / viewport.height + : screenshot.devicePixelRatio + const x = point.x / (scaleX || 1) + const y = point.y / (scaleY || 1) + const pageInfo = await ctx.browser.refreshPageInfo(page).catch(() => null) + const hitElementId = await ctx.browser + .resolveElementAtPoint(page, x, y) + .catch(() => null) + const hitElement = + hitElementId !== null + ? 
await ctx.browser + .resolveElementProperties(page, hitElementId) + .catch(() => null) + : null + + return { + x, + y, + log: { + page, + pageUrl: truncateForLog(pageInfo?.url), + pageTitle: truncateForLog(pageInfo?.title), + prompt: truncateForLog(prompt), + promptLength: prompt.length, + promptTruncated: prompt.length > LOG_TEXT_MAX_CHARS, + modelPoint: point, + resolvedPoint: { x, y }, + scale: { x: scaleX, y: scaleY }, + screenshot: { + width: dimensions?.width, + height: dimensions?.height, + devicePixelRatio: screenshot.devicePixelRatio, + }, + viewport, + hitElementId, + hitElement: hitElement + ? { + tagName: hitElement.tagName, + role: hitElement.role, + ariaLabel: truncateForLog(hitElement.ariaLabel), + labelText: truncateForLog(hitElement.labelText), + textContent: truncateForLog(hitElement.textContent), + } + : null, + }, + } +} diff --git a/packages/browseros-agent/apps/server/src/tools/input.ts b/packages/browseros-agent/apps/server/src/tools/input.ts index 72047284a..ad9160ff0 100644 --- a/packages/browseros-agent/apps/server/src/tools/input.ts +++ b/packages/browseros-agent/apps/server/src/tools/input.ts @@ -1,5 +1,8 @@ import { z } from 'zod' -import { defineToolWithCategory } from './framework' +import { logger } from '../lib/logger' +import { defineToolWithCategory, type ToolContext } from './framework' +import { resolveGuiPoint } from './gui-click-resolver' +import type { ToolResponse } from './response' const pageParam = z.number().describe('Page ID (from list_pages)') const defineInputTool = defineToolWithCategory('input') @@ -7,12 +10,46 @@ const elementParam = z .number() .describe('Element ID from snapshot (the number in [N])') +async function enforceAcl( + toolName: string, + args: Record, + ctx: ToolContext, + response: ToolResponse, +): Promise { + if (!ctx.aclRules?.length) return false + + const { checkAcl } = await import('./acl/acl-guard') + const check = await checkAcl(toolName, args, ctx.browser, ctx.aclRules) + if (!check.blocked) 
return false + + const desc = + check.rule?.description ?? + check.rule?.textMatch ?? + check.rule?.sitePattern ?? + 'ACL rule' + if (check.pageId !== undefined && check.elementId !== undefined) { + await ctx.browser.highlightBlockedElement( + check.pageId, + check.elementId, + desc, + ) + } + response.error( + `Action blocked by ACL rule: "${desc}". The element on this page is restricted. Choose a different action or skip this step.`, + ) + return true +} + export const click = defineInputTool({ name: 'click', - description: 'Click an element by its ID from the last snapshot', + description: + 'Click a visible page target using the GUI click model. Provide a concise visual prompt for what to click.', input: z.object({ page: pageParam, - element: elementParam, + prompt: z + .string() + .min(1) + .describe('Visual click instruction, e.g. "click the search box"'), button: z .enum(['left', 'right', 'middle']) .default('left') @@ -25,27 +62,43 @@ export const click = defineInputTool({ output: z.object({ action: z.literal('click'), page: z.number(), - element: z.number(), + prompt: z.string(), button: z.enum(['left', 'right', 'middle']), clickCount: z.number(), + x: z.number(), + y: z.number(), }), handler: async (args, ctx, response) => { - const coords = await ctx.browser.click(args.page, args.element, { + const { x, y, log } = await resolveGuiPoint(ctx, args.page, args.prompt) + const clickLog = { + ...log, + clickPoint: { x, y }, + button: args.button, + clickCount: args.clickCount, + } + + const blocked = await enforceAcl('click', { ...args, x, y }, ctx, response) + if (blocked) { + logger.info('GUI click blocked by ACL', clickLog) + return + } + + logger.info('GUI click dispatching', clickLog) + await ctx.browser.clickAt(args.page, x, y, { button: args.button, clickCount: args.clickCount, }) - const coordText = coords - ? 
` at (${Math.round(coords.x)}, ${Math.round(coords.y)})` - : '' - response.text(`Clicked [${args.element}]${coordText}`) + logger.info('GUI click dispatched', clickLog) + response.text('tool call executed successfully') response.data({ action: 'click', page: args.page, - element: args.element, + prompt: args.prompt, button: args.button, clickCount: args.clickCount, + x, + y, }) - response.includeSnapshot(args.page) }, }) @@ -146,22 +199,43 @@ export const drag_at = defineInputTool({ export const hover = defineInputTool({ name: 'hover', - description: 'Hover over an element by its ID', + description: + 'Hover over a visible page target using the GUI click model. Provide a concise visual prompt for what to hover.', input: z.object({ page: pageParam, - element: elementParam, + prompt: z + .string() + .min(1) + .describe('Visual hover instruction, e.g. "hover the account menu"'), }), output: z.object({ action: z.literal('hover'), page: z.number(), - element: z.number(), + prompt: z.string(), + x: z.number(), + y: z.number(), }), handler: async (args, ctx, response) => { - const coords = await ctx.browser.hover(args.page, args.element) - response.text( - `Hovered over [${args.element}] at (${Math.round(coords.x)}, ${Math.round(coords.y)})`, - ) - response.data({ action: 'hover', page: args.page, element: args.element }) + const { x, y, log } = await resolveGuiPoint(ctx, args.page, args.prompt) + const hoverLog = { ...log, hoverPoint: { x, y } } + + const blocked = await enforceAcl('hover', { ...args, x, y }, ctx, response) + if (blocked) { + logger.info('GUI hover blocked by ACL', hoverLog) + return + } + + logger.info('GUI hover dispatching', hoverLog) + await ctx.browser.hoverAt(args.page, x, y) + logger.info('GUI hover dispatched', hoverLog) + response.text('tool call executed successfully') + response.data({ + action: 'hover', + page: args.page, + prompt: args.prompt, + x, + y, + }) }, }) @@ -251,6 +325,32 @@ export const press_key = defineInputTool({ }, }) +export 
const type_text = defineInputTool({ + name: 'type_text', + description: + 'Type text into the currently focused element. Use after GUI click focuses a text field.', + input: z.object({ + page: pageParam, + text: z + .string() + .describe('Text to type into the currently focused element'), + }), + output: z.object({ + action: z.literal('type_text'), + page: z.number(), + textLength: z.number(), + }), + handler: async (args, ctx, response) => { + await ctx.browser.typeText(args.page, args.text) + response.text('tool call executed successfully') + response.data({ + action: 'type_text', + page: args.page, + textLength: args.text.length, + }) + }, +}) + export const drag = defineInputTool({ name: 'drag', description: @@ -303,7 +403,7 @@ export const drag = defineInputTool({ export const scroll = defineInputTool({ name: 'scroll', - description: 'Scroll the page or a specific element', + description: 'Scroll the page viewport', input: z.object({ page: pageParam, direction: z @@ -311,32 +411,21 @@ export const scroll = defineInputTool({ .default('down') .describe('Scroll direction'), amount: z.number().default(3).describe('Number of scroll ticks'), - element: z - .number() - .optional() - .describe('Element ID to scroll at (scrolls page center if omitted)'), }), output: z.object({ action: z.literal('scroll'), page: z.number(), direction: z.enum(['up', 'down', 'left', 'right']), amount: z.number(), - element: z.number().optional(), }), handler: async (args, ctx, response) => { - await ctx.browser.scroll( - args.page, - args.direction, - args.amount, - args.element, - ) + await ctx.browser.scroll(args.page, args.direction, args.amount) response.text(`Scrolled ${args.direction} by ${args.amount}`) response.data({ action: 'scroll', page: args.page, direction: args.direction, amount: args.amount, - element: args.element, }) }, }) diff --git a/packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts 
b/packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts new file mode 100644 index 000000000..0ea8f5191 --- /dev/null +++ b/packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts @@ -0,0 +1,170 @@ +import { Buffer } from 'node:buffer' +import { logger } from '../lib/logger' +import { + MOLMO_POINT_ENDPOINT, + MOLMO_POINT_MAX_NEW_TOKENS, + MOLMO_POINT_TIMEOUT_MS, +} from './molmo-point-config' + +interface MolmoPoint { + object_id?: unknown + image_num?: unknown + x?: unknown + y?: unknown +} + +interface MolmoPointResponse { + text?: unknown + points?: unknown +} + +export interface ClickPoint { + x: number + y: number +} + +export interface PngDimensions { + width: number + height: number +} + +const MOLMO_POINT_RESPONSE_LOG_MAX_CHARS = 2_000 +const MOLMO_POINT_ERROR_BODY_MAX_CHARS = 500 +const MOLMO_POINT_INSTRUCTION_LOG_MAX_CHARS = 1_000 + +function pointUrl(): string { + return new URL('/point', MOLMO_POINT_ENDPOINT).toString() +} + +function truncateText(text: string, maxChars: number): string { + if (text.length <= maxChars) return text + return `${text.slice(0, maxChars)}... 
(+${text.length - maxChars} chars)` +} + +function firstValidPoint(points: unknown): ClickPoint | null { + if (!Array.isArray(points)) return null + + for (const rawPoint of points) { + const point = rawPoint as MolmoPoint + if (typeof point.x !== 'number' || typeof point.y !== 'number') continue + if (!Number.isFinite(point.x) || !Number.isFinite(point.y)) continue + return { x: point.x, y: point.y } + } + + return null +} + +export async function requestMolmoPoint(args: { + instruction: string + imageB64: string +}): Promise { + const endpoint = pointUrl() + const instruction = truncateText( + args.instruction, + MOLMO_POINT_INSTRUCTION_LOG_MAX_CHARS, + ) + const instructionLength = args.instruction.length + const instructionTruncated = + instructionLength > MOLMO_POINT_INSTRUCTION_LOG_MAX_CHARS + + logger.info('Molmo point request started', { + endpoint, + instruction, + instructionLength, + instructionTruncated, + }) + + const response = await fetch(endpoint, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ + instruction: args.instruction, + image_b64: args.imageB64, + max_new_tokens: MOLMO_POINT_MAX_NEW_TOKENS, + }), + signal: AbortSignal.timeout(MOLMO_POINT_TIMEOUT_MS), + }) + + if (!response.ok) { + const body = await response.text().catch(() => '') + logger.warn('Molmo point request failed', { + endpoint, + instruction, + instructionLength, + instructionTruncated, + status: response.status, + statusText: response.statusText, + rawResponseText: truncateText(body, MOLMO_POINT_RESPONSE_LOG_MAX_CHARS), + rawResponseTextLength: body.length, + rawResponseTextTruncated: + body.length > MOLMO_POINT_RESPONSE_LOG_MAX_CHARS, + }) + const suffix = body + ? 
`: ${truncateText(body, MOLMO_POINT_ERROR_BODY_MAX_CHARS)}` + : '' + throw new Error(`Molmo point request failed (${response.status})${suffix}`) + } + + const rawResponseText = await response.text() + let payload: MolmoPointResponse + try { + payload = JSON.parse(rawResponseText) as MolmoPointResponse + } catch (error) { + logger.warn('Molmo point response parse failed', { + endpoint, + instruction, + instructionLength, + instructionTruncated, + status: response.status, + statusText: response.statusText, + rawResponseText: truncateText( + rawResponseText, + MOLMO_POINT_RESPONSE_LOG_MAX_CHARS, + ), + rawResponseTextLength: rawResponseText.length, + rawResponseTextTruncated: + rawResponseText.length > MOLMO_POINT_RESPONSE_LOG_MAX_CHARS, + error: error instanceof Error ? error.message : String(error), + }) + throw error + } + + const point = firstValidPoint(payload.points) + logger.info('Molmo point response received', { + endpoint, + instruction, + instructionLength, + instructionTruncated, + status: response.status, + statusText: response.statusText, + rawResponseText: truncateText( + rawResponseText, + MOLMO_POINT_RESPONSE_LOG_MAX_CHARS, + ), + rawResponseTextLength: rawResponseText.length, + rawResponseTextTruncated: + rawResponseText.length > MOLMO_POINT_RESPONSE_LOG_MAX_CHARS, + parsedPoint: point, + }) + + if (!point) { + throw new Error('Molmo point response did not include a valid point') + } + + return point +} + +export function getPngDimensionsFromBase64( + imageB64: string, +): PngDimensions | null { + const buffer = Buffer.from(imageB64, 'base64') + if (buffer.length < 24) return null + + const pngSignature = '89504e470d0a1a0a' + if (buffer.subarray(0, 8).toString('hex') !== pngSignature) return null + + return { + width: buffer.readUInt32BE(16), + height: buffer.readUInt32BE(20), + } +} diff --git a/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts b/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts new file mode 
100644 index 000000000..80c684089 --- /dev/null +++ b/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts @@ -0,0 +1,5 @@ +export const MOLMO_POINT_ENDPOINT = + 'https://gseb9k0a2n2vhl-8000.proxy.runpod.net/' + +export const MOLMO_POINT_MAX_NEW_TOKENS = 64 +export const MOLMO_POINT_TIMEOUT_MS = 60_000 diff --git a/packages/browseros-agent/apps/server/src/tools/registry.ts b/packages/browseros-agent/apps/server/src/tools/registry.ts index ae119d70b..5f42560ae 100644 --- a/packages/browseros-agent/apps/server/src/tools/registry.ts +++ b/packages/browseros-agent/apps/server/src/tools/registry.ts @@ -31,6 +31,7 @@ import { scroll, select_option, type_at, + type_text, uncheck, upload_file, } from './input' @@ -95,12 +96,13 @@ export const registry = createRegistry([ evaluate_script, get_console_logs, - // Input (17) + // Input (18) click, click_at, hover, hover_at, type_at, + type_text, drag_at, focus, clear, diff --git a/packages/browseros-agent/apps/server/src/tools/response.ts b/packages/browseros-agent/apps/server/src/tools/response.ts index 5432fba49..80846a1c4 100644 --- a/packages/browseros-agent/apps/server/src/tools/response.ts +++ b/packages/browseros-agent/apps/server/src/tools/response.ts @@ -25,6 +25,10 @@ interface ToolResponseOptions { postActionTimeoutMs?: number } +interface ToolResponseBuildOptions { + suppressSnapshots?: boolean +} + export class ToolResponse { private content: ContentItem[] = [] private hasError = false @@ -123,12 +127,19 @@ export class ToolResponse { } } - async build(browser: Browser): Promise { - if (this.postActions.length > 0) { + async build( + browser: Browser, + options: ToolResponseBuildOptions = {}, + ): Promise { + const postActions = options.suppressSnapshots + ? 
this.postActions.filter((action) => action.type !== 'snapshot') + : this.postActions + + if (postActions.length > 0) { this.text('\n--- Additional context (auto-included) ---') } - for (const action of this.postActions) { + for (const action of postActions) { try { await this.withTimeout(this.runPostAction(action, browser)) } catch { diff --git a/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts b/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts index 5657605e4..ad659ac8e 100644 --- a/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts +++ b/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts @@ -36,6 +36,7 @@ const VERB_OVERRIDES: Record = { hover: 'Hovered', hover_at: 'Hovered at coordinates', type_at: 'Typed at coordinates', + type_text: 'Typed text', drag_at: 'Dragged', focus: 'Focused element', fill: 'Filled field', @@ -186,8 +187,8 @@ const SUBJECT_EXTRACTORS: Record = { find_files: (i) => quote(stringField(i, 'pattern', 'query')), // Element interactions - click: (i) => stringField(i, 'element'), - hover: (i) => stringField(i, 'element'), + click: (i) => stringField(i, 'prompt'), + hover: (i) => stringField(i, 'prompt', 'element'), focus: (i) => stringField(i, 'element'), clear: (i) => stringField(i, 'element'), check: (i) => stringField(i, 'element'), @@ -199,6 +200,7 @@ const SUBJECT_EXTRACTORS: Record = { return target ?? 
truncate(text, 40) }, press_key: (i) => stringField(i, 'key'), + type_text: (i) => truncate(stringField(i, 'text'), 40), // Coordinate-based input click_at: (i) => coords(i.x, i.y), diff --git a/packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts b/packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts new file mode 100644 index 000000000..cd1948ecb --- /dev/null +++ b/packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts @@ -0,0 +1,30 @@ +import { describe, expect, it } from 'bun:test' +import { isGuiClickOnlyBrowserToolAllowed } from '../../src/agent/gui-click-only' + +describe('GUI click-only browser tool gating', () => { + it('keeps GUI click and basic page-opening tools available', () => { + expect(isGuiClickOnlyBrowserToolAllowed('click')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('hover')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('scroll')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('type_text')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('new_page')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('navigate_page')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('list_pages')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('get_active_page')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('close_page')).toBe(true) + }) + + it('blocks page observation and legacy interaction tools', () => { + expect(isGuiClickOnlyBrowserToolAllowed('take_snapshot')).toBe(false) + expect(isGuiClickOnlyBrowserToolAllowed('take_enhanced_snapshot')).toBe( + false, + ) + expect(isGuiClickOnlyBrowserToolAllowed('get_dom')).toBe(false) + expect(isGuiClickOnlyBrowserToolAllowed('get_page_content')).toBe(false) + expect(isGuiClickOnlyBrowserToolAllowed('take_screenshot')).toBe(false) + expect(isGuiClickOnlyBrowserToolAllowed('click_at')).toBe(false) + expect(isGuiClickOnlyBrowserToolAllowed('fill')).toBe(false) + 
expect(isGuiClickOnlyBrowserToolAllowed('press_key')).toBe(false) + expect(isGuiClickOnlyBrowserToolAllowed('type_at')).toBe(false) + }) +}) diff --git a/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts b/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts index b5917bde3..aef9b54dc 100644 --- a/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts +++ b/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts @@ -92,6 +92,17 @@ function buildScheduled(overrides?: Partial): string { }) } +/** Build a prompt for experimental GUI click-only mode */ +function buildGuiClickOnly( + overrides?: Partial, +): string { + return buildSystemPrompt({ + guiClickOnly: true, + origin: 'sidepanel', + ...overrides, + }) +} + // --------------------------------------------------------------------------- // 1. SECTION PRESENCE // @@ -284,6 +295,22 @@ describe('mode-aware framing', () => { expect(prompt).toContain('cannot interact with them') }) + it('GUI click-only mode exposes only GUI click and page-opening guidance', () => { + const prompt = buildGuiClickOnly() + expect(prompt).toContain('experimental GUI click model mode') + expect(prompt).toContain('Use `click` for visible page targets') + expect(prompt).toContain('Use `hover` for hover menus') + expect(prompt).toContain('Use `type_text` only after') + expect(prompt).toContain('Use `scroll` to move the page viewport') + expect(prompt).toContain('`new_page`') + expect(prompt).toContain('`navigate_page`') + expect(prompt).toContain('`close_page`') + expect(prompt).not.toContain('take_snapshot') + expect(prompt).not.toContain('get_dom') + expect(prompt).not.toContain('`press_key`') + expect(prompt).not.toContain('') + }) + it('chat mode excludes memory-and-identity section', () => { // Why: chat mode is read-only — no memory writes, no soul updates. // The agent shouldn't even see memory tool instructions. 
diff --git a/packages/browseros-agent/apps/server/tests/tools/input.test.ts b/packages/browseros-agent/apps/server/tests/tools/input.test.ts index 0abbed401..8dd1a30e1 100644 --- a/packages/browseros-agent/apps/server/tests/tools/input.test.ts +++ b/packages/browseros-agent/apps/server/tests/tools/input.test.ts @@ -13,8 +13,13 @@ import { scroll, select_option, type_at, + type_text, uncheck, } from '../../src/tools/input' +import { + type ClickPoint, + getPngDimensionsFromBase64, +} from '../../src/tools/molmo-point-client' import { close_page, navigate_page, new_page } from '../../src/tools/navigation' import { evaluate_script, take_snapshot } from '../../src/tools/snapshot' import { cleanupWithBrowser, withBrowser } from '../__helpers__/with-browser' @@ -121,6 +126,39 @@ async function pointInsideElement( return { x: point.x, y: point.y } } +async function withMockedGuiPoint( + browser: Browser, + pageId: number, + viewportPoint: ClickPoint, + fn: () => Promise, +): Promise { + const screenshot = await browser.screenshot(pageId, { + format: 'png', + fullPage: false, + }) + const dimensions = getPngDimensionsFromBase64(screenshot.data) + const viewport = await browser.viewportSize(pageId) + const scaleX = dimensions + ? dimensions.width / viewport.width + : screenshot.devicePixelRatio + const scaleY = dimensions + ? dimensions.height / viewport.height + : screenshot.devicePixelRatio + const originalFetch = globalThis.fetch + globalThis.fetch = (async () => + new Response( + JSON.stringify({ + points: [{ x: viewportPoint.x * scaleX, y: viewportPoint.y * scaleY }], + }), + { status: 200, headers: { 'content-type': 'application/json' } }, + )) as typeof fetch + try { + await fn() + } finally { + globalThis.fetch = originalFetch + } +} + const FORM_PAGE = `data:text/html,${encodeURIComponent(`

Test Form

@@ -196,7 +234,7 @@ describe('input tools', () => { }, 60_000) it('click triggers a button', async () => { - await withBrowser(async ({ execute }) => { + await withBrowser(async ({ browser, execute }) => { const newResult = await execute(new_page, { url: FORM_PAGE }) const pageId = pageIdOf(newResult) @@ -206,18 +244,28 @@ describe('input tools', () => { const inputId = findElementId(snapText, 'Enter name') await execute(fill, { page: pageId, element: inputId, text: 'Alice' }) - // Click submit - const btnId = findElementId(snapText, 'Submit') - const clickResult = await execute(click, { - page: pageId, - element: btnId, - }) - assert.ok(!clickResult.isError, textOf(clickResult)) - const clickData = structuredOf<{ action: string; element: number }>( - clickResult, + // Click submit via the GUI point model response. + const buttonPoint = await pointInsideElement( + { browser, directories: { workingDir: process.cwd() } }, + pageId, + 'submit-btn', ) - assert.strictEqual(clickData.action, 'click') - assert.strictEqual(clickData.element, btnId) + await withMockedGuiPoint(browser, pageId, buttonPoint, async () => { + const clickResult = await execute(click, { + page: pageId, + prompt: 'click the Submit button', + }) + assert.ok(!clickResult.isError, textOf(clickResult)) + assert.strictEqual( + textOf(clickResult), + 'tool call executed successfully', + ) + const clickData = structuredOf<{ action: string; prompt: string }>( + clickResult, + ) + assert.strictEqual(clickData.action, 'click') + assert.strictEqual(clickData.prompt, 'click the Submit button') + }) const output = await execute(evaluate_script, { page: pageId, @@ -229,6 +277,61 @@ describe('input tools', () => { }) }, 60_000) + it('click is blocked by ACL after the GUI point resolves', async () => { + await withBrowser(async ({ browser }) => { + const ctx: ToolContext = { + browser, + directories: { workingDir: process.cwd() }, + aclRules: [ + { + id: 'submit-rule', + sitePattern: '*', + textMatch: 'Submit', + 
enabled: true, + }, + ], + } + + const newResult = await executeTool( + new_page, + { url: FORM_PAGE }, + ctx, + AbortSignal.timeout(30_000), + ) + const pageId = pageIdOf(newResult) + const buttonPoint = await pointInsideElement(ctx, pageId, 'submit-btn') + + await withMockedGuiPoint(browser, pageId, buttonPoint, async () => { + const clickResult = await executeTool( + click, + { page: pageId, prompt: 'click the Submit button' }, + ctx, + AbortSignal.timeout(30_000), + ) + assert.ok(clickResult.isError, 'Expected ACL to block GUI click') + assert.ok(textOf(clickResult).includes('Action blocked by ACL rule')) + }) + + const output = await executeTool( + evaluate_script, + { + page: pageId, + expression: 'document.getElementById("output").textContent', + }, + ctx, + AbortSignal.timeout(30_000), + ) + assert.strictEqual(textOf(output), '') + + await executeTool( + close_page, + { page: pageId }, + ctx, + AbortSignal.timeout(30_000), + ) + }) + }, 60_000) + it('check and uncheck toggle a checkbox', async () => { await withBrowser(async ({ execute }) => { const newResult = await execute(new_page, { url: FORM_PAGE }) @@ -397,6 +500,104 @@ describe('input tools', () => { }) }, 60_000) + it('type_text types into the focused element', async () => { + await withBrowser(async ({ execute }) => { + const newResult = await execute(new_page, { url: FORM_PAGE }) + const pageId = pageIdOf(newResult) + + const snap = await execute(take_snapshot, { page: pageId }) + const inputId = findElementId(textOf(snap), 'Enter name') + await execute(fill, { page: pageId, element: inputId, text: 'hello' }) + + const typeResult = await execute(type_text, { + page: pageId, + text: ' world', + }) + assert.ok(!typeResult.isError, textOf(typeResult)) + assert.strictEqual(textOf(typeResult), 'tool call executed successfully') + assert.deepStrictEqual(structuredOf(typeResult), { + action: 'type_text', + page: pageId, + textLength: ' world'.length, + }) + + const val = await execute(evaluate_script, { 
+ page: pageId, + expression: 'document.getElementById("name").value', + }) + assert.strictEqual(textOf(val), 'hello world') + + await execute(close_page, { page: pageId }) + }) + }, 60_000) + + it('type_text is blocked by ACL on the focused element', async () => { + await withBrowser(async ({ browser }) => { + const ctx: ToolContext = { + browser, + directories: { workingDir: process.cwd() }, + } + + const newResult = await executeTool( + new_page, + { url: FORM_PAGE }, + ctx, + AbortSignal.timeout(30_000), + ) + const pageId = pageIdOf(newResult) + + const snap = await executeTool( + take_snapshot, + { page: pageId }, + ctx, + AbortSignal.timeout(30_000), + ) + const inputId = findElementId(textOf(snap), 'Enter name') + await executeTool( + fill, + { page: pageId, element: inputId, text: 'hello' }, + ctx, + AbortSignal.timeout(30_000), + ) + + ctx.aclRules = [ + { + id: 'name-rule', + sitePattern: '*', + textMatch: 'Enter name', + enabled: true, + }, + ] + + const typeResult = await executeTool( + type_text, + { page: pageId, text: ' blocked' }, + ctx, + AbortSignal.timeout(30_000), + ) + assert.ok(typeResult.isError, 'Expected ACL to block focused typing') + assert.ok(textOf(typeResult).includes('Action blocked by ACL rule')) + + const val = await executeTool( + evaluate_script, + { + page: pageId, + expression: 'document.getElementById("name").value', + }, + ctx, + AbortSignal.timeout(30_000), + ) + assert.strictEqual(textOf(val), 'hello') + + await executeTool( + close_page, + { page: pageId }, + ctx, + AbortSignal.timeout(30_000), + ) + }) + }, 60_000) + it('scroll dispatches without error', async () => { const calls: Array<{ page: number @@ -437,24 +638,35 @@ describe('input tools', () => { page: 7, direction: 'down', amount: 5, - element: undefined, }) }) - it('hover moves cursor over element', async () => { - await withBrowser(async ({ execute }) => { + it('hover moves cursor via the GUI point model response', async () => { + await withBrowser(async ({ 
browser, execute }) => { const newResult = await execute(new_page, { url: FORM_PAGE }) const pageId = pageIdOf(newResult) - const snap = await execute(take_snapshot, { page: pageId }) - const btnId = findElementId(textOf(snap), 'Submit') - - const hoverResult = await execute(hover, { - page: pageId, - element: btnId, + const buttonPoint = await pointInsideElement( + { browser, directories: { workingDir: process.cwd() } }, + pageId, + 'submit-btn', + ) + await withMockedGuiPoint(browser, pageId, buttonPoint, async () => { + const hoverResult = await execute(hover, { + page: pageId, + prompt: 'hover the Submit button', + }) + assert.ok(!hoverResult.isError, textOf(hoverResult)) + assert.strictEqual( + textOf(hoverResult), + 'tool call executed successfully', + ) + const hoverData = structuredOf<{ action: string; prompt: string }>( + hoverResult, + ) + assert.strictEqual(hoverData.action, 'hover') + assert.strictEqual(hoverData.prompt, 'hover the Submit button') }) - assert.ok(!hoverResult.isError, textOf(hoverResult)) - assert.ok(textOf(hoverResult).includes('Hovered')) await execute(close_page, { page: pageId }) }) @@ -467,7 +679,7 @@ describe('input tools', () => { directories: { workingDir: process.cwd() }, } const run = - (tool: typeof new_page | typeof take_snapshot | typeof click) => + (tool: typeof new_page | typeof take_snapshot | typeof fill) => (args: unknown) => executeTool(tool, args, ctx, AbortSignal.timeout(30_000)) @@ -475,21 +687,29 @@ describe('input tools', () => { const pageId = pageIdOf(newResult) const snap = await run(take_snapshot)({ page: pageId }) - const btnId = findElementId(textOf(snap), 'Submit') + const inputId = findElementId(textOf(snap), 'Enter name') - const beforeBlock = await run(click)({ page: pageId, element: btnId }) + const beforeBlock = await run(fill)({ + page: pageId, + element: inputId, + text: 'Allowed', + }) assert.ok(!beforeBlock.isError, textOf(beforeBlock)) ctx.aclRules = [ { - id: 'submit-rule', + id: 'name-rule', 
sitePattern: '*', - textMatch: 'Submit', + textMatch: 'Enter name', enabled: true, }, ] - const afterBlock = await run(click)({ page: pageId, element: btnId }) + const afterBlock = await run(fill)({ + page: pageId, + element: inputId, + text: 'Blocked', + }) assert.ok(afterBlock.isError, 'Expected ACL block after updating rules') assert.ok(textOf(afterBlock).includes('Action blocked by ACL rule')) @@ -598,19 +818,9 @@ describe('input tools', () => { ) assert.ok(!navResult.isError, textOf(navResult)) - const snap = await executeTool( - take_snapshot, - { page: pageId }, - ctx, - AbortSignal.timeout(30_000), - ) - const linkMatch = textOf(snap).match(/\[(\d+)\]\s*link/) - assert.ok(linkMatch, `Expected a link in snapshot:\n${textOf(snap)}`) - const linkId = Number(linkMatch?.[1]) - const blockedClick = await executeTool( - click, - { page: pageId, element: linkId }, + click_at, + { page: pageId, x: 10, y: 10 }, ctx, AbortSignal.timeout(30_000), ) diff --git a/packages/browseros-agent/apps/server/tests/tools/response.test.ts b/packages/browseros-agent/apps/server/tests/tools/response.test.ts index a8d81fd41..c782dc947 100644 --- a/packages/browseros-agent/apps/server/tests/tools/response.test.ts +++ b/packages/browseros-agent/apps/server/tests/tools/response.test.ts @@ -73,4 +73,26 @@ describe('ToolResponse', () => { assert.ok(text.includes('[Page 1 snapshot]')) assert.ok(text.includes('[42] button "Submit"')) }) + + it('suppresses snapshot post-actions when requested', async () => { + const response = new ToolResponse({ postActionTimeoutMs: 200 }) + response.text('ok') + response.includeSnapshot(1) + + let called = false + const browser = { + snapshot: async () => { + called = true + return '[42] button "Submit"' + }, + } as unknown as Browser + + const result = await response.build(browser, { suppressSnapshots: true }) + const text = textOf(result) + + assert.equal(called, false) + assert.ok(text.includes('ok')) + assert.ok(!text.includes('Additional context')) + 
assert.ok(!text.includes('[Page 1 snapshot]')) + }) }) From 940ef6dbd5816849bd86eb9c055162444d2de4cc Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Fri, 1 May 2026 14:16:00 +0100 Subject: [PATCH 02/13] feat: added screenshot for agent perception --- .../apps/server/src/agent/gui-click-only.ts | 1 + .../browseros-agent/apps/server/src/agent/prompt.ts | 11 +++++++---- .../apps/server/src/tools/molmo-point-config.ts | 2 +- .../apps/server/tests/agent/gui-click-only.test.ts | 2 +- .../apps/server/tests/agent/prompt.test.ts | 1 + 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/packages/browseros-agent/apps/server/src/agent/gui-click-only.ts b/packages/browseros-agent/apps/server/src/agent/gui-click-only.ts index db420356f..65a248c4a 100644 --- a/packages/browseros-agent/apps/server/src/agent/gui-click-only.ts +++ b/packages/browseros-agent/apps/server/src/agent/gui-click-only.ts @@ -5,6 +5,7 @@ export const GUI_CLICK_ONLY_BROWSER_TOOL_NAMES = new Set([ 'hover', 'scroll', 'type_text', + 'take_screenshot', 'get_active_page', 'list_pages', 'navigate_page', diff --git a/packages/browseros-agent/apps/server/src/agent/prompt.ts b/packages/browseros-agent/apps/server/src/agent/prompt.ts index a3f579ebb..cab96e701 100644 --- a/packages/browseros-agent/apps/server/src/agent/prompt.ts +++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts @@ -142,9 +142,10 @@ Use these browser tools under the GUI click model constraint: - \`hover\` captures the current page screenshot internally, asks the GUI model where to hover based on your prompt, then moves the cursor there. - \`type_text\` types into the currently focused element. Use it after \`click\` focuses a text field. - \`scroll\` scrolls the page viewport. +- \`take_screenshot\` returns a visual screenshot for feedback. It does not expose DOM, accessibility tree, page text, links, or scripts. 
- \`get_active_page\`, \`list_pages\`, \`navigate_page\`, \`new_page\`, and \`close_page\` are available for opening and managing pages. -You cannot inspect the DOM, accessibility tree, snapshots, page text, screenshots, links, or scripts. Use the Page ID from Browser Context directly and issue concise visual click prompts for page targets. +You cannot inspect the DOM, accessibility tree, snapshots, page text, links, or scripts. Use the Page ID from Browser Context directly and issue concise visual click prompts for page targets.
` } @@ -225,10 +226,11 @@ function getExecution( - Use \`click\` for visible page targets. It is the only click path that should choose page coordinates. - Use \`hover\` for visible hover targets, \`type_text\` after focusing a field, and \`scroll\` to move the viewport. +- Use \`take_screenshot\` when you need explicit visual feedback about the current page before choosing the next action. Do not call it after every action by default. - Use \`new_page\` or \`navigate_page\` to open websites. Use \`get_active_page\`, \`list_pages\`, and \`close_page\` only when needed for page management. - Use the Page ID from Browser Context directly. -- Do not try to observe the page with snapshots, DOM, accessibility trees, screenshots, scripts, link extraction, or text extraction. -- You are blind to page content between clicks. Make one concise visual click prompt at a time, then continue from your best estimate of the resulting page state. +- Do not try to observe the page with snapshots, DOM, accessibility trees, scripts, link extraction, or text extraction. +- You are blind to page content except for explicit \`take_screenshot\` results. Make one concise visual click prompt at a time, then continue from your best estimate of the resulting page state. - If the task clearly cannot proceed without page observation, say what blocked you. ` } @@ -329,9 +331,10 @@ function getToolSelection( - Use \`hover\` for hover menus or targets that reveal content. - Use \`type_text\` only after a prior GUI click likely focused a text input. Include a newline in \`text\` when you need to submit with Enter. - Use \`scroll\` to move the page viewport when the target is likely below or above the visible area. +- Use \`take_screenshot\` sparingly when you need visual feedback before deciding what to click, type, hover, or scroll next. 
- The \`prompt\` argument should describe the visible target to click, for example: "click the search box", "click the Add to Cart button", or "click the first product result". - Use page-opening and page-management tools only to get to the website or manage tabs; they do not replace visual page clicking. -- Do not request or rely on element IDs, snapshots, DOM nodes, page text, screenshots, scripts, link extraction, or coordinate click tools. +- Do not request or rely on element IDs, snapshots, DOM nodes, page text, scripts, link extraction, or coordinate click tools. ` } diff --git a/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts b/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts index 80c684089..2a4b12273 100644 --- a/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts +++ b/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts @@ -2,4 +2,4 @@ export const MOLMO_POINT_ENDPOINT = 'https://gseb9k0a2n2vhl-8000.proxy.runpod.net/' export const MOLMO_POINT_MAX_NEW_TOKENS = 64 -export const MOLMO_POINT_TIMEOUT_MS = 60_000 +export const MOLMO_POINT_TIMEOUT_MS = 10_000 diff --git a/packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts b/packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts index cd1948ecb..641672bf7 100644 --- a/packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts +++ b/packages/browseros-agent/apps/server/tests/agent/gui-click-only.test.ts @@ -7,6 +7,7 @@ describe('GUI click-only browser tool gating', () => { expect(isGuiClickOnlyBrowserToolAllowed('hover')).toBe(true) expect(isGuiClickOnlyBrowserToolAllowed('scroll')).toBe(true) expect(isGuiClickOnlyBrowserToolAllowed('type_text')).toBe(true) + expect(isGuiClickOnlyBrowserToolAllowed('take_screenshot')).toBe(true) expect(isGuiClickOnlyBrowserToolAllowed('new_page')).toBe(true) expect(isGuiClickOnlyBrowserToolAllowed('navigate_page')).toBe(true) 
expect(isGuiClickOnlyBrowserToolAllowed('list_pages')).toBe(true) @@ -21,7 +22,6 @@ describe('GUI click-only browser tool gating', () => { ) expect(isGuiClickOnlyBrowserToolAllowed('get_dom')).toBe(false) expect(isGuiClickOnlyBrowserToolAllowed('get_page_content')).toBe(false) - expect(isGuiClickOnlyBrowserToolAllowed('take_screenshot')).toBe(false) expect(isGuiClickOnlyBrowserToolAllowed('click_at')).toBe(false) expect(isGuiClickOnlyBrowserToolAllowed('fill')).toBe(false) expect(isGuiClickOnlyBrowserToolAllowed('press_key')).toBe(false) diff --git a/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts b/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts index aef9b54dc..60956aaad 100644 --- a/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts +++ b/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts @@ -302,6 +302,7 @@ describe('mode-aware framing', () => { expect(prompt).toContain('Use `hover` for hover menus') expect(prompt).toContain('Use `type_text` only after') expect(prompt).toContain('Use `scroll` to move the page viewport') + expect(prompt).toContain('Use `take_screenshot` sparingly') expect(prompt).toContain('`new_page`') expect(prompt).toContain('`navigate_page`') expect(prompt).toContain('`close_page`') From 3c66cbabaeebddcf644ac63e0d48dd86292a9294 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Fri, 1 May 2026 17:11:34 +0100 Subject: [PATCH 03/13] fix: evals & timeout fix: extend molmo point timeout feat: modal endpoint --- .../configs/legacy/agisdk-real-smoke.json | 2 +- .../apps/eval/configs/legacy/agisdk-real.json | 2 +- .../eval/configs/suites/agisdk-daily-10.json | 2 +- .../configs/suites/agisdk-real-smoke.json | 2 +- .../apps/eval/configs/suites/agisdk-real.json | 2 +- .../browseros-agent/apps/eval/src/cli/args.ts | 12 ++++++++++ .../apps/eval/src/cli/commands/run.ts | 3 +++ .../apps/eval/src/cli/commands/suite.ts | 13 ++++++++++- .../apps/eval/src/cli/index.ts | 1 + 
.../apps/eval/tests/cli/args.test.ts | 22 +++++++++++++++++++ .../apps/eval/tests/cli/suite-command.test.ts | 6 +++++ .../apps/server/src/agent/prompt.ts | 2 +- .../server/src/tools/molmo-point-config.ts | 4 ++-- 13 files changed, 64 insertions(+), 9 deletions(-) diff --git a/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real-smoke.json b/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real-smoke.json index 9d52ddbf9..46ce9d518 100644 --- a/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real-smoke.json +++ b/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real-smoke.json @@ -8,7 +8,7 @@ "supportsImages": true }, "dataset": "../../data/agisdk-real-smoke.jsonl", - "num_workers": 1, + "num_workers": 20, "restart_server_per_task": true, "browseros": { "server_url": "http://127.0.0.1:9110", diff --git a/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real.json b/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real.json index 32d8becad..6544b9f52 100644 --- a/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real.json +++ b/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real.json @@ -8,7 +8,7 @@ "supportsImages": true }, "dataset": "../../data/agisdk-real.jsonl", - "num_workers": 4, + "num_workers": 20, "restart_server_per_task": true, "browseros": { "server_url": "http://127.0.0.1:9110", diff --git a/packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json b/packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json index e08d7013b..81acdf19b 100644 --- a/packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json +++ b/packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json @@ -5,7 +5,7 @@ "type": "single" }, "graders": ["agisdk_state_diff"], - "workers": 1, + "workers": 20, "restartBrowserPerTask": true, "timeoutMs": 1800000, "browseros": { diff --git a/packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json 
b/packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json index 0ff97e302..ab9783466 100644 --- a/packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json +++ b/packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json @@ -5,7 +5,7 @@ "type": "single" }, "graders": ["agisdk_state_diff"], - "workers": 1, + "workers": 20, "restartBrowserPerTask": true, "timeoutMs": 1800000, "browseros": { diff --git a/packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json b/packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json index 51fa98999..eefcb4747 100644 --- a/packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json +++ b/packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json @@ -5,7 +5,7 @@ "type": "single" }, "graders": ["agisdk_state_diff"], - "workers": 1, + "workers": 20, "restartBrowserPerTask": true, "timeoutMs": 1800000, "browseros": { diff --git a/packages/browseros-agent/apps/eval/src/cli/args.ts b/packages/browseros-agent/apps/eval/src/cli/args.ts index 73b2b94de..fae401fe1 100644 --- a/packages/browseros-agent/apps/eval/src/cli/args.ts +++ b/packages/browseros-agent/apps/eval/src/cli/args.ts @@ -18,6 +18,9 @@ export interface SuiteCliArgs { apiKey?: string baseUrl?: string publishTarget?: PublishTarget + query?: string + startUrl?: string + outputDir?: string } export interface RunCliArgs @@ -83,6 +86,9 @@ function parseSuiteLikeArgs( 'api-key': { type: 'string' }, 'base-url': { type: 'string' }, publish: { type: 'string' }, + query: { type: 'string' }, + 'start-url': { type: 'string' }, + 'output-dir': { type: 'string' }, }, }) @@ -104,6 +110,12 @@ function parseSuiteLikeArgs( if (apiKey) parsed.apiKey = apiKey const baseUrl = stringValue(values['base-url']) if (baseUrl) parsed.baseUrl = baseUrl + const query = stringValue(values.query) + if (query) parsed.query = query + const startUrl = stringValue(values['start-url']) + if (startUrl) parsed.startUrl = startUrl + const 
outputDir = stringValue(values['output-dir']) + if (outputDir) parsed.outputDir = outputDir if (command === 'suite') { const target = publishTarget(stringValue(values.publish)) diff --git a/packages/browseros-agent/apps/eval/src/cli/commands/run.ts b/packages/browseros-agent/apps/eval/src/cli/commands/run.ts index 37228870b..2c1dcd316 100644 --- a/packages/browseros-agent/apps/eval/src/cli/commands/run.ts +++ b/packages/browseros-agent/apps/eval/src/cli/commands/run.ts @@ -15,6 +15,9 @@ export async function runRunCommand( model: args.model, apiKey: args.apiKey, baseUrl: args.baseUrl, + query: args.query, + startUrl: args.startUrl, + outputDir: args.outputDir, }, deps, ) diff --git a/packages/browseros-agent/apps/eval/src/cli/commands/suite.ts b/packages/browseros-agent/apps/eval/src/cli/commands/suite.ts index 05abcc62d..242f20ffd 100644 --- a/packages/browseros-agent/apps/eval/src/cli/commands/suite.ts +++ b/packages/browseros-agent/apps/eval/src/cli/commands/suite.ts @@ -21,6 +21,9 @@ export interface SuiteCommandOptions { apiKey?: string baseUrl?: string publishTarget?: PublishTarget + query?: string + startUrl?: string + outputDir?: string env?: Env } @@ -179,11 +182,19 @@ export async function runSuiteCommand( const resolved = await resolveSuiteCommand(options) const runOptions: RunEvalOptions = resolved.kind === 'config' - ? { configPath: resolved.configPath } + ? 
{ + configPath: resolved.configPath, + query: options.query, + startUrl: options.startUrl, + outputDir: options.outputDir, + } : { configPath: resolved.suitePath, dataPath: resolved.datasetPath, config: resolved.evalConfig, + query: options.query, + startUrl: options.startUrl, + outputDir: options.outputDir, } const result = await runEval(runOptions) diff --git a/packages/browseros-agent/apps/eval/src/cli/index.ts b/packages/browseros-agent/apps/eval/src/cli/index.ts index 9a894c4c1..f4d127e74 100644 --- a/packages/browseros-agent/apps/eval/src/cli/index.ts +++ b/packages/browseros-agent/apps/eval/src/cli/index.ts @@ -15,6 +15,7 @@ Usage: bun run eval suite --suite --variant [--publish r2] bun run eval run --config bun run eval run --suite --variant + bun run eval run --config --query "..." --start-url bun run eval grade --run bun run eval publish --run --target r2 bun run eval -c diff --git a/packages/browseros-agent/apps/eval/tests/cli/args.test.ts b/packages/browseros-agent/apps/eval/tests/cli/args.test.ts index 810e5e094..518abef03 100644 --- a/packages/browseros-agent/apps/eval/tests/cli/args.test.ts +++ b/packages/browseros-agent/apps/eval/tests/cli/args.test.ts @@ -52,6 +52,28 @@ describe('parseEvalCliArgs', () => { }) }) + it('parses one-off query overrides for a single eval run', () => { + expect( + parseEvalCliArgs([ + 'run', + '--config', + 'configs/legacy/gui-click-amazon-smoke.json', + '--query', + 'open the cart', + '--start-url', + 'https://www.amazon.com/', + '--output-dir', + '/tmp/gui-click-eval', + ]), + ).toEqual({ + command: 'run', + configPath: 'configs/legacy/gui-click-amazon-smoke.json', + query: 'open the cart', + startUrl: 'https://www.amazon.com/', + outputDir: '/tmp/gui-click-eval', + }) + }) + it('rejects missing required command options with targeted errors', () => { expect(() => parseEvalCliArgs(['run'])).toThrow( 'run requires --config or --suite', diff --git a/packages/browseros-agent/apps/eval/tests/cli/suite-command.test.ts 
b/packages/browseros-agent/apps/eval/tests/cli/suite-command.test.ts index 54e06f9a9..1e32a7fbf 100644 --- a/packages/browseros-agent/apps/eval/tests/cli/suite-command.test.ts +++ b/packages/browseros-agent/apps/eval/tests/cli/suite-command.test.ts @@ -116,6 +116,9 @@ describe('suite command', () => { suitePath, model: 'moonshotai/kimi-k2.5', provider: 'openai-compatible', + query: 'Open Amazon cart', + startUrl: 'https://www.amazon.com/', + outputDir: '/tmp/gui-click-eval', env: {}, }, { @@ -132,5 +135,8 @@ describe('suite command', () => { expect(basename(calls[1].configPath)).toBe('agisdk-daily-10.json') expect(calls[1].config).toBeDefined() expect(calls[1].dataPath?.endsWith('tasks.jsonl')).toBe(true) + expect(calls[1].query).toBe('Open Amazon cart') + expect(calls[1].startUrl).toBe('https://www.amazon.com/') + expect(calls[1].outputDir).toBe('/tmp/gui-click-eval') }) }) diff --git a/packages/browseros-agent/apps/server/src/agent/prompt.ts b/packages/browseros-agent/apps/server/src/agent/prompt.ts index cab96e701..c3fcf48e2 100644 --- a/packages/browseros-agent/apps/server/src/agent/prompt.ts +++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts @@ -35,7 +35,7 @@ function getRoleAndMode( return ` You are BrowserOS running in an experimental GUI click model mode. Page clicks are mediated through the \`click\` tool, which uses a visual model to choose coordinates from the current screenshot. -Your tool surface is intentionally small: open or manage pages, then interact with visible page targets through GUI-backed clicks. You cannot read page content or inspect elements through DOM, accessibility-tree, snapshot, page-text, screenshot, link-extraction, or script-evaluation tools in this mode. +Your tool surface is intentionally small: open or manage pages, then interact with visible page targets through GUI-backed clicks. 
You cannot read page content or inspect elements through DOM, accessibility-tree, snapshot, page-text, link-extraction, or script-evaluation tools in this mode. ` } diff --git a/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts b/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts index 2a4b12273..5c68fd784 100644 --- a/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts +++ b/packages/browseros-agent/apps/server/src/tools/molmo-point-config.ts @@ -1,5 +1,5 @@ export const MOLMO_POINT_ENDPOINT = - 'https://gseb9k0a2n2vhl-8000.proxy.runpod.net/' + 'https://browseros--molmopoint-gui-molmopointserver-web.modal.run/' export const MOLMO_POINT_MAX_NEW_TOKENS = 64 -export const MOLMO_POINT_TIMEOUT_MS = 10_000 +export const MOLMO_POINT_TIMEOUT_MS = 60_000 From b444afa117d533ac2e9c57c5c925aa3463a0adb2 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 11:51:36 +0100 Subject: [PATCH 04/13] feat: enabled reasoning and added configs --- ...on => agisdk-real-gpt-5-5-openrouter.json} | 11 ++++---- ...agent-opus-4-7-openrouter-agisdk-real.json | 28 +++++++++++++++++++ .../apps/server/src/agent/provider-factory.ts | 14 +++++++++- .../apps/server/src/agent/types.ts | 1 + .../packages/shared/src/schemas/llm.ts | 17 +++++++++++ 5 files changed, 64 insertions(+), 7 deletions(-) rename packages/browseros-agent/apps/eval/configs/legacy/{browseros-agent-opus-4-6-agisdk-real.json => agisdk-real-gpt-5-5-openrouter.json} (68%) create mode 100644 packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json diff --git a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json b/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real-gpt-5-5-openrouter.json similarity index 68% rename from packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json rename to 
packages/browseros-agent/apps/eval/configs/legacy/agisdk-real-gpt-5-5-openrouter.json index 02ccf3ef4..bfa7447f5 100644 --- a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json +++ b/packages/browseros-agent/apps/eval/configs/legacy/agisdk-real-gpt-5-5-openrouter.json @@ -1,15 +1,14 @@ { "agent": { "type": "single", - "provider": "bedrock", - "model": "global.anthropic.claude-opus-4-6-v1", - "region": "AWS_REGION", - "accessKeyId": "AWS_ACCESS_KEY_ID", - "secretAccessKey": "AWS_SECRET_ACCESS_KEY", + "provider": "openai-compatible", + "model": "openai/gpt-5.5", + "apiKey": "OPENROUTER_API_KEY", + "baseUrl": "https://openrouter.ai/api/v1", "supportsImages": true }, "dataset": "../../data/agisdk-real.jsonl", - "num_workers": 2, + "num_workers": 20, "restart_server_per_task": true, "browseros": { "server_url": "http://127.0.0.1:9110", diff --git a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json new file mode 100644 index 000000000..3970af46c --- /dev/null +++ b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json @@ -0,0 +1,28 @@ +{ + "agent": { + "type": "single", + "provider": "openrouter", + "model": "anthropic/claude-opus-4.7", + "apiKey": "OPENROUTER_API_KEY", + "supportsImages": true, + "reasoning": { + "enabled": true + } + }, + "dataset": "../../data/agisdk-real.jsonl", + "num_workers": 10, + "restart_server_per_task": true, + "browseros": { + "server_url": "http://127.0.0.1:9110", + "base_cdp_port": 9010, + "base_server_port": 9110, + "base_extension_port": 9310, + "load_extensions": false, + "headless": false + }, + "captcha": { + "api_key_env": "NOPECHA_API_KEY" + }, + "graders": ["agisdk_state_diff"], + "timeout_ms": 1800000 +} diff --git a/packages/browseros-agent/apps/server/src/agent/provider-factory.ts 
b/packages/browseros-agent/apps/server/src/agent/provider-factory.ts index 263c09ed3..a452ade5a 100644 --- a/packages/browseros-agent/apps/server/src/agent/provider-factory.ts +++ b/packages/browseros-agent/apps/server/src/agent/provider-factory.ts @@ -44,13 +44,25 @@ function createGoogleFactory( return createGoogleGenerativeAI({ apiKey: config.apiKey }) } +function buildOpenRouterReasoning( + config: ResolvedAgentConfig, +): Record { + const r = config.reasoning + if (!r) return {} + const body: Record = {} + if (r.enabled !== undefined) body.enabled = r.enabled + if (r.maxTokens !== undefined) body.max_tokens = r.maxTokens + if (r.effort !== undefined) body.effort = r.effort + return body +} + function createOpenRouterFactory( config: ResolvedAgentConfig, ): (modelId: string) => unknown { if (!config.apiKey) throw new Error('OpenRouter provider requires apiKey') return createOpenRouter({ apiKey: config.apiKey, - extraBody: { reasoning: {} }, + extraBody: { reasoning: buildOpenRouterReasoning(config) }, fetch: createOpenRouterCompatibleFetch(), }) } diff --git a/packages/browseros-agent/apps/server/src/agent/types.ts b/packages/browseros-agent/apps/server/src/agent/types.ts index 764704274..f8eb7ed30 100644 --- a/packages/browseros-agent/apps/server/src/agent/types.ts +++ b/packages/browseros-agent/apps/server/src/agent/types.ts @@ -34,6 +34,7 @@ export interface ResolvedAgentConfig { accountId?: string reasoningEffort?: string reasoningSummary?: string + reasoning?: { enabled?: boolean; maxTokens?: number; effort?: string } contextWindowSize?: number userSystemPrompt?: string workingDir?: string diff --git a/packages/browseros-agent/packages/shared/src/schemas/llm.ts b/packages/browseros-agent/packages/shared/src/schemas/llm.ts index 45e7cc029..dade9d62a 100644 --- a/packages/browseros-agent/packages/shared/src/schemas/llm.ts +++ b/packages/browseros-agent/packages/shared/src/schemas/llm.ts @@ -84,6 +84,15 @@ export const LLMConfigSchema: z.ZodObject<{ 
sessionToken: z.ZodOptional reasoningEffort: z.ZodOptional> reasoningSummary: z.ZodOptional> + reasoning: z.ZodOptional< + z.ZodObject<{ + enabled: z.ZodOptional + maxTokens: z.ZodOptional + effort: z.ZodOptional< + z.ZodEnum<['minimal', 'low', 'medium', 'high', 'xhigh']> + > + }> + > }> = z.object({ provider: LLMProviderSchema, model: z.string().optional(), @@ -99,6 +108,14 @@ export const LLMConfigSchema: z.ZodObject<{ // ChatGPT Pro (Codex) reasoningEffort: z.enum(['none', 'low', 'medium', 'high']).optional(), reasoningSummary: z.enum(['auto', 'concise', 'detailed']).optional(), + // Provider-specific reasoning controls. + reasoning: z + .object({ + enabled: z.boolean().optional(), + maxTokens: z.number().optional(), + effort: z.enum(['minimal', 'low', 'medium', 'high', 'xhigh']).optional(), + }) + .optional(), }) export type LLMConfig = z.infer From daeb33fc6c39b5f1c9faa6150707c951900dd4c4 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 12:55:05 +0100 Subject: [PATCH 05/13] feat(eval): explain AGISDK state-diff failures --- .../graders/benchmark/agisdk-state-diff.ts | 87 ++++++++++++++++++- .../src/graders/python/agisdk-evaluate.py | 23 ++++- .../apps/eval/src/runs/eval-runner.ts | 17 ++++ .../tests/grading/agisdk-artifacts.test.ts | 35 +++++++- 4 files changed, 157 insertions(+), 5 deletions(-) diff --git a/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts b/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts index c3e7843a8..4007e8650 100644 --- a/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts +++ b/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts @@ -31,6 +31,16 @@ interface AgisdkEvaluatorOutput { per_criterion: unknown[] } +interface FailedAgisdkCriterion { + index: number + detail: string + expected?: unknown + actual?: unknown +} + +const MAX_REASONING_CRITERIA = 8 +const MAX_REASONING_DETAIL_CHARS = 700 + export class 
AgisdkStateDiffGrader implements Grader { name = 'agisdk_state_diff' @@ -99,15 +109,23 @@ export class AgisdkStateDiffGrader implements Grader { 'stderr.txt', evaluation.stderr, ) + const failedCriteria = this.extractFailedCriteria(result.per_criterion) + if (failedCriteria.length > 0) { + await writeGraderJsonArtifact( + input, + this.name, + 'failed-criteria.json', + failedCriteria, + ) + } return { score: result.reward, pass: result.pass, - reasoning: - result.message || - (result.pass ? 'All criteria passed' : 'Some criteria failed'), + reasoning: this.buildReasoning(result, failedCriteria), details: { reward: result.reward, per_criterion: result.per_criterion, + failed_criteria: failedCriteria, origin, agisdk_task_id: taskId, }, @@ -148,6 +166,69 @@ export class AgisdkStateDiffGrader implements Grader { return null } + private buildReasoning( + result: AgisdkEvaluatorOutput, + failedCriteria: FailedAgisdkCriterion[], + ): string { + const base = + result.message || + (result.pass ? 'All criteria passed' : 'Some criteria failed') + + if (result.pass || failedCriteria.length === 0) return base + + const shown = failedCriteria.slice(0, MAX_REASONING_CRITERIA) + const lines = shown.map( + (criterion) => + `${criterion.index + 1}. ${this.formatFailedCriterion(criterion)}`, + ) + const remaining = failedCriteria.length - shown.length + if (remaining > 0) { + lines.push(`... ${remaining} more failed criteria`) + } + + return `${base}\nFailed criteria:\n${lines.join('\n')}` + } + + private extractFailedCriteria( + perCriterion: unknown[], + ): FailedAgisdkCriterion[] { + return perCriterion.flatMap((criterion, index) => { + if (!criterion || typeof criterion !== 'object') return [] + const record = criterion as Record + if (record.passed === true) return [] + + const detail = + typeof record.detail === 'string' + ? record.detail + : this.stringifyCriterionValue(record.raw_detail ?? 
record) + const failed: FailedAgisdkCriterion = { + index, + detail, + } + if ('expected_value' in record) failed.expected = record.expected_value + if ('actual_value' in record) failed.actual = record.actual_value + return [failed] + }) + } + + private formatFailedCriterion(criterion: FailedAgisdkCriterion): string { + const parts = [criterion.detail] + if ('expected' in criterion) { + parts.push(`expected=${this.stringifyCriterionValue(criterion.expected)}`) + } + if ('actual' in criterion) { + parts.push(`actual=${this.stringifyCriterionValue(criterion.actual)}`) + } + + const text = parts.join(' | ') + if (text.length <= MAX_REASONING_DETAIL_CHARS) return text + return `${text.slice(0, MAX_REASONING_DETAIL_CHARS)}... (+${text.length - MAX_REASONING_DETAIL_CHARS} chars)` + } + + private stringifyCriterionValue(value: unknown): string { + return typeof value === 'string' ? value : JSON.stringify(value) + } + private async fetchFinishState( origin: string, mcpEndpoint: string, diff --git a/packages/browseros-agent/apps/eval/src/graders/python/agisdk-evaluate.py b/packages/browseros-agent/apps/eval/src/graders/python/agisdk-evaluate.py index 0ec46eacf..b1a32cca2 100644 --- a/packages/browseros-agent/apps/eval/src/graders/python/agisdk-evaluate.py +++ b/packages/browseros-agent/apps/eval/src/graders/python/agisdk-evaluate.py @@ -26,6 +26,18 @@ _STRICT = os.environ.get("AGISDK_STRICT_STRINGS", "").lower() in ("1", "true", "yes") +def _json_safe(value: object) -> object: + try: + json.dumps(value) + return value + except TypeError: + if isinstance(value, dict): + return {str(k): _json_safe(v) for k, v in value.items()} + if isinstance(value, (list, tuple)): + return [_json_safe(v) for v in value] + return str(value) + + def _soft_string_match(detail: object) -> bool: """Return True iff `detail` is `{actual_value, expected_value}` with both strings and a non-empty `expected_value` that is contained in `actual_value` @@ -87,7 +99,16 @@ def main(): for r in results: 
passed = bool(r[0]) detail = r[1] if len(r) > 1 else "" - entry: dict = {"passed": passed, "detail": str(detail)} + entry: dict = { + "passed": passed, + "detail": str(detail), + "raw_detail": _json_safe(detail), + } + if isinstance(detail, dict): + if "actual_value" in detail: + entry["actual_value"] = _json_safe(detail.get("actual_value")) + if "expected_value" in detail: + entry["expected_value"] = _json_safe(detail.get("expected_value")) if not _STRICT and not passed and _soft_string_match(detail): entry["passed"] = True entry["softened"] = True diff --git a/packages/browseros-agent/apps/eval/src/runs/eval-runner.ts b/packages/browseros-agent/apps/eval/src/runs/eval-runner.ts index e8308dba7..33b285551 100644 --- a/packages/browseros-agent/apps/eval/src/runs/eval-runner.ts +++ b/packages/browseros-agent/apps/eval/src/runs/eval-runner.ts @@ -256,10 +256,27 @@ function printTaskProgress( for (const [name, gr] of Object.entries(result.graderResults)) { const icon = gr.pass ? 'PASS' : 'FAIL' console.log(` ${name}: ${icon}`) + if (!gr.pass && gr.reasoning) { + printIndentedReasoning(gr.reasoning, ' ') + } } } } +function printIndentedReasoning( + reasoning: string, + indent: string, + maxLines = 12, +): void { + const lines = reasoning.trim().split('\n') + for (const line of lines.slice(0, maxLines)) { + console.log(`${indent}${line}`) + } + if (lines.length > maxLines) { + console.log(`${indent}... 
${lines.length - maxLines} more lines`) + } +} + // ============================================================================ // Summary // ============================================================================ diff --git a/packages/browseros-agent/apps/eval/tests/grading/agisdk-artifacts.test.ts b/packages/browseros-agent/apps/eval/tests/grading/agisdk-artifacts.test.ts index 9a9a5f70f..bab1c9efe 100644 --- a/packages/browseros-agent/apps/eval/tests/grading/agisdk-artifacts.test.ts +++ b/packages/browseros-agent/apps/eval/tests/grading/agisdk-artifacts.test.ts @@ -31,7 +31,14 @@ describe('AgisdkStateDiffGrader artifacts', () => { reward: 0, pass: false, message: 'Missing entree', - per_criterion: [{ passed: false, detail: 'entree missing' }], + per_criterion: [ + { + passed: false, + detail: 'cart item mismatch', + expected_value: 'Entree', + actual_value: 'Soup', + }, + ], }, stderr: 'criterion log', }) @@ -53,6 +60,17 @@ describe('AgisdkStateDiffGrader artifacts', () => { const result = await grader.grade(input) expect(result.pass).toBe(false) + expect(result.reasoning).toContain('Failed criteria:') + expect(result.reasoning).toContain('expected=Entree') + expect(result.reasoning).toContain('actual=Soup') + expect(result.details?.failed_criteria).toEqual([ + { + index: 0, + detail: 'cart item mismatch', + expected: 'Entree', + actual: 'Soup', + }, + ]) expect( JSON.parse( await readFile( @@ -69,6 +87,21 @@ describe('AgisdkStateDiffGrader artifacts', () => { ), ), ).toMatchObject({ message: 'Missing entree' }) + expect( + JSON.parse( + await readFile( + join(dir, 'grader-artifacts/agisdk_state_diff/failed-criteria.json'), + 'utf-8', + ), + ), + ).toEqual([ + { + index: 0, + detail: 'cart item mismatch', + expected: 'Entree', + actual: 'Soup', + }, + ]) expect( await readFile( join(dir, 'grader-artifacts/agisdk_state_diff/stderr.txt'), From fc2f669445d53c39f51fa130afb84695d01ef17e Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 12:56:45 
+0100 Subject: [PATCH 06/13] feat(eval): retry provider errors before failing --- .../apps/eval/src/agents/single-agent.ts | 161 ++++++++------- .../eval/src/utils/provider-error-retry.ts | 184 ++++++++++++++++++ .../apps/eval/src/utils/with-eval-timeout.ts | 8 + .../tests/utils/provider-error-retry.test.ts | 99 ++++++++++ 4 files changed, 378 insertions(+), 74 deletions(-) create mode 100644 packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts create mode 100644 packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts diff --git a/packages/browseros-agent/apps/eval/src/agents/single-agent.ts b/packages/browseros-agent/apps/eval/src/agents/single-agent.ts index 8a181bd48..3fd55df76 100644 --- a/packages/browseros-agent/apps/eval/src/agents/single-agent.ts +++ b/packages/browseros-agent/apps/eval/src/agents/single-agent.ts @@ -10,6 +10,10 @@ import { registry } from '@browseros/server/tools/registry' import { CaptchaWaiter } from '../capture/captcha-waiter' import { DEFAULT_TIMEOUT_MS } from '../constants' import type { TaskMetadata } from '../types' +import { + isProviderExecutionError, + retryProviderErrors, +} from '../utils/provider-error-retry' import { resolveProviderConfig } from '../utils/resolve-provider-config' import { withEvalTimeout } from '../utils/with-eval-timeout' import type { AgentContext, AgentEvaluator, AgentResult } from './types' @@ -89,87 +93,96 @@ export class SingleAgentEvaluator implements AgentEvaluator { capture, async (signal) => { if (!agent) throw new Error('Agent was not initialized') + const activeAgent = agent // Format prompt with browser context so the agent knows what page it's on // (same formatting as chat-service.ts → formatUserMessage) const prompt = formatUserMessage(task.query, browserContext) - const result = await agent.toolLoopAgent.generate({ - prompt, - abortSignal: signal, - - experimental_onToolCallStart: ({ toolCall }) => { - const input = toolCall.input as - | Record - | 
undefined - if (input && typeof input.page === 'number') { - capture.setActivePageId(input.page) - } - }, - - experimental_onToolCallFinish: async () => { - try { - if (captchaWaiter) { - await captchaWaiter.waitIfCaptchaPresent( - browser, - capture.getActivePageId(), - ) - } - const screenshotNum = await capture.screenshot.capture( - capture.getActivePageId(), - ) - capture.emitEvent(task.query_id, { - type: 'screenshot-captured', - screenshot: screenshotNum, - }) - } catch { - // Screenshot failures are non-fatal - } - }, - - onStepFinish: async ({ toolCalls, toolResults, text }) => { - if (toolCalls) { - for (const tc of toolCalls) { - const inputEvent = { - type: 'tool-input-available', - toolCallId: tc.toolCallId, - toolName: tc.toolName, - input: tc.input, - } as any - await capture.messageLogger.logStreamEvent(inputEvent) - capture.emitEvent(task.query_id, inputEvent) - } - } - - if (toolResults) { - for (const tr of toolResults) { - const outputEvent = { - type: 'tool-output-available', - toolCallId: tr.toolCallId, - output: tr.output, - } as any - await capture.messageLogger.logStreamEvent(outputEvent) - capture.emitEvent(task.query_id, outputEvent) - } - } - - if (text) { - const textId = randomUUID() - const startEvent = { type: 'text-start', id: textId } as any - const deltaEvent = { - type: 'text-delta', - id: textId, - delta: text, - } as any - const endEvent = { type: 'text-end', id: textId } as any - await capture.messageLogger.logStreamEvent(startEvent) - await capture.messageLogger.logStreamEvent(deltaEvent) - await capture.messageLogger.logStreamEvent(endEvent) - capture.emitEvent(task.query_id, deltaEvent) - } + const result = await retryProviderErrors( + () => + activeAgent.toolLoopAgent.generate({ + prompt, + abortSignal: signal, + + experimental_onToolCallStart: ({ toolCall }) => { + const input = toolCall.input as + | Record + | undefined + if (input && typeof input.page === 'number') { + capture.setActivePageId(input.page) + } + }, + + 
experimental_onToolCallFinish: async () => { + try { + if (captchaWaiter) { + await captchaWaiter.waitIfCaptchaPresent( + browser, + capture.getActivePageId(), + ) + } + const screenshotNum = await capture.screenshot.capture( + capture.getActivePageId(), + ) + capture.emitEvent(task.query_id, { + type: 'screenshot-captured', + screenshot: screenshotNum, + }) + } catch { + // Screenshot failures are non-fatal + } + }, + + onStepFinish: async ({ toolCalls, toolResults, text }) => { + if (toolCalls) { + for (const tc of toolCalls) { + const inputEvent = { + type: 'tool-input-available', + toolCallId: tc.toolCallId, + toolName: tc.toolName, + input: tc.input, + } as any + await capture.messageLogger.logStreamEvent(inputEvent) + capture.emitEvent(task.query_id, inputEvent) + } + } + + if (toolResults) { + for (const tr of toolResults) { + const outputEvent = { + type: 'tool-output-available', + toolCallId: tr.toolCallId, + output: tr.output, + } as any + await capture.messageLogger.logStreamEvent(outputEvent) + capture.emitEvent(task.query_id, outputEvent) + } + } + + if (text) { + const textId = randomUUID() + const startEvent = { type: 'text-start', id: textId } as any + const deltaEvent = { + type: 'text-delta', + id: textId, + delta: text, + } as any + const endEvent = { type: 'text-end', id: textId } as any + await capture.messageLogger.logStreamEvent(startEvent) + await capture.messageLogger.logStreamEvent(deltaEvent) + await capture.messageLogger.logStreamEvent(endEvent) + capture.emitEvent(task.query_id, deltaEvent) + } + }, + }), + { + label: `single-agent ${task.query_id}`, + signal, }, - }) + ) finalText = result.text || null }, + { rethrowError: isProviderExecutionError }, ) const endTime = Date.now() diff --git a/packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts b/packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts new file mode 100644 index 000000000..428a76e21 --- /dev/null +++ 
b/packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts @@ -0,0 +1,184 @@ +import { sleep } from './sleep' + +const DEFAULT_PROVIDER_ERROR_RETRIES = 5 +const DEFAULT_PROVIDER_ERROR_RETRY_WINDOW_MS = 10_000 +const PROVIDER_ERROR_LOG_MAX_STRING_CHARS = 10_000 +const PROVIDER_ERROR_LOG_MAX_DEPTH = 5 + +const REDACTED_KEYS = /authorization|api[-_]?key|token|secret|cookie/i + +export interface ProviderErrorRetryEvent { + retryNumber: number + maxRetries: number + delayMs: number + error: unknown +} + +export interface ProviderErrorRetryOptions { + label: string + signal?: AbortSignal + retries?: number + windowMs?: number + onRetry?: (event: ProviderErrorRetryEvent) => void +} + +function readStringProperty(value: unknown, key: string): string | undefined { + if (!value || typeof value !== 'object') return undefined + const raw = (value as Record)[key] + return typeof raw === 'string' ? raw : undefined +} + +function readArrayProperty(value: unknown, key: string): unknown[] { + if (!value || typeof value !== 'object') return [] + const raw = (value as Record)[key] + return Array.isArray(raw) ? raw : [] +} + +function errorMarkers(error: unknown, seen = new Set()): string[] { + if (!error || seen.has(error)) return [] + seen.add(error) + + const markers = [ + readStringProperty(error, 'name'), + error instanceof Error ? 
error.message : undefined, + ].filter((value): value is string => !!value) + + if (error && typeof error === 'object') { + const record = error as Record + if ('isRetryable' in record) markers.push('isRetryable') + if ('statusCode' in record) markers.push('statusCode') + if ('responseBody' in record) markers.push('responseBody') + if ('cause' in record) { + markers.push(...errorMarkers(record.cause, seen)) + } + } + + for (const nestedError of readArrayProperty(error, 'errors')) { + markers.push(...errorMarkers(nestedError, seen)) + } + + return markers +} + +export function isProviderExecutionError(error: unknown): boolean { + const markerText = errorMarkers(error).join('\n') + return ( + markerText.includes('Provider returned error') || + markerText.includes('APICallError') || + markerText.includes('AI_RetryError') || + markerText.includes('RetryError') || + markerText.includes('isRetryable') || + markerText.includes('statusCode') || + markerText.includes('responseBody') + ) +} + +function errorMessage(error: unknown): string { + return error instanceof Error ? error.message : String(error) +} + +function truncateString(value: string): string { + if (value.length <= PROVIDER_ERROR_LOG_MAX_STRING_CHARS) return value + return `${value.slice(0, PROVIDER_ERROR_LOG_MAX_STRING_CHARS)}... (+${value.length - PROVIDER_ERROR_LOG_MAX_STRING_CHARS} chars)` +} + +function serializeForLog( + value: unknown, + depth = 0, + seen = new Set(), +): unknown { + if (typeof value === 'string') return truncateString(value) + if (value === null || typeof value !== 'object') return value + if (seen.has(value)) return '[Circular]' + if (depth >= PROVIDER_ERROR_LOG_MAX_DEPTH) return '[MaxDepth]' + + seen.add(value) + + if (value instanceof Error) { + const serialized: Record = { + name: value.name, + message: value.message, + stack: value.stack, + } + + for (const key of Object.getOwnPropertyNames(value)) { + if (key in serialized) continue + serialized[key] = REDACTED_KEYS.test(key) + ? 
'[Redacted]' + : serializeForLog( + (value as unknown as Record)[key], + depth + 1, + seen, + ) + } + + if ('cause' in value) { + serialized.cause = serializeForLog(value.cause, depth + 1, seen) + } + + return serialized + } + + if (Array.isArray(value)) { + return value.map((item) => serializeForLog(item, depth + 1, seen)) + } + + const serialized: Record = {} + for (const [key, item] of Object.entries(value)) { + serialized[key] = REDACTED_KEYS.test(key) + ? '[Redacted]' + : serializeForLog(item, depth + 1, seen) + } + return serialized +} + +function logFinalProviderError( + label: string, + error: unknown, + attempts: number, +): void { + console.error( + `[provider-retry] ${label}: provider error persisted after ${attempts} attempts. Final error:\n${JSON.stringify( + serializeForLog(error), + null, + 2, + )}`, + ) +} + +export async function retryProviderErrors( + operation: () => Promise, + options: ProviderErrorRetryOptions, +): Promise { + const maxRetries = options.retries ?? DEFAULT_PROVIDER_ERROR_RETRIES + const windowMs = options.windowMs ?? DEFAULT_PROVIDER_ERROR_RETRY_WINDOW_MS + const delayMs = maxRetries > 0 ? 
Math.floor(windowMs / maxRetries) : 0 + + for (let attempt = 0; ; attempt++) { + try { + return await operation() + } catch (error) { + const isProviderError = isProviderExecutionError(error) + if (options.signal?.aborted || !isProviderError) { + throw error + } + + if (attempt >= maxRetries) { + logFinalProviderError(options.label, error, attempt + 1) + throw error + } + + const event = { + retryNumber: attempt + 1, + maxRetries, + delayMs, + error, + } + options.onRetry?.(event) + console.warn( + `[provider-retry] ${options.label}: retry ${event.retryNumber}/${maxRetries} in ${delayMs}ms after provider error: ${errorMessage(error)}`, + ) + await sleep(delayMs, options.signal) + } + } +} diff --git a/packages/browseros-agent/apps/eval/src/utils/with-eval-timeout.ts b/packages/browseros-agent/apps/eval/src/utils/with-eval-timeout.ts index 87073daa7..5f9f378e4 100644 --- a/packages/browseros-agent/apps/eval/src/utils/with-eval-timeout.ts +++ b/packages/browseros-agent/apps/eval/src/utils/with-eval-timeout.ts @@ -13,10 +13,15 @@ export interface TimeoutResult { terminationReason: TerminationReason } +export interface EvalTimeoutOptions { + rethrowError?: (error: Error) => boolean +} + export async function withEvalTimeout( timeoutMs: number, capture: CaptureContext, fn: (signal: AbortSignal) => Promise, + options: EvalTimeoutOptions = {}, ): Promise> { const abortController = new AbortController() const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs) @@ -39,6 +44,9 @@ export async function withEvalTimeout( capture.addError('agent_execution', error.message, { stack: error.stack, }) + if (options.rethrowError?.(error)) { + throw error + } } return { terminationReason } diff --git a/packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts b/packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts new file mode 100644 index 000000000..0b1b5f2f5 --- /dev/null +++ 
b/packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts @@ -0,0 +1,99 @@ +import { describe, expect, it } from 'bun:test' +import { + isProviderExecutionError, + retryProviderErrors, +} from '../../src/utils/provider-error-retry' + +function providerError(message = 'Provider returned error'): Error { + const error = new Error(message) + error.name = 'APICallError' + ;(error as unknown as Record).statusCode = 500 + ;(error as unknown as Record).responseBody = + '{"error":"upstream failed"}' + return error +} + +async function withoutRetryWarnings(fn: () => Promise): Promise { + const originalWarn = console.warn + const originalError = console.error + console.warn = () => {} + console.error = () => {} + try { + return await fn() + } finally { + console.warn = originalWarn + console.error = originalError + } +} + +describe('provider error retries', () => { + it('detects provider errors from SDK-style markers', () => { + expect(isProviderExecutionError(providerError())).toBe(true) + expect(isProviderExecutionError(new Error('regular tool failure'))).toBe( + false, + ) + }) + + it('retries provider errors and returns a later success', async () => { + await withoutRetryWarnings(async () => { + let calls = 0 + const result = await retryProviderErrors( + async () => { + calls++ + if (calls <= 3) throw providerError() + return 'ok' + }, + { label: 'test', retries: 5, windowMs: 0 }, + ) + + expect(result).toBe('ok') + expect(calls).toBe(4) + }) + }) + + it('throws the final provider error after retries are exhausted', async () => { + const originalWarn = console.warn + const originalError = console.error + const errorLogs: string[] = [] + console.warn = () => {} + console.error = (message?: unknown) => { + errorLogs.push(String(message)) + } + + try { + let calls = 0 + await expect( + retryProviderErrors( + async () => { + calls++ + throw providerError() + }, + { label: 'test', retries: 5, windowMs: 0 }, + ), + ).rejects.toThrow('Provider returned 
error') + expect(calls).toBe(6) + expect(errorLogs.join('\n')).toContain( + 'provider error persisted after 6 attempts', + ) + expect(errorLogs.join('\n')).toContain('responseBody') + expect(errorLogs.join('\n')).toContain('upstream failed') + } finally { + console.warn = originalWarn + console.error = originalError + } + }) + + it('does not retry non-provider errors', async () => { + let calls = 0 + await expect( + retryProviderErrors( + async () => { + calls++ + throw new Error('tool failed') + }, + { label: 'test', retries: 5, windowMs: 0 }, + ), + ).rejects.toThrow('tool failed') + expect(calls).toBe(1) + }) +}) From 0816e217e861e95f9091378794a4f360d50a1eff Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 12:57:34 +0100 Subject: [PATCH 07/13] feat(server): log agent finish reasons --- .../apps/server/src/agent/ai-sdk-agent.ts | 39 +++++++++++++++++++ .../server/src/api/services/chat-service.ts | 34 +++++++++++++++- .../apps/server/src/tools/input.ts | 4 -- .../server/src/tools/molmo-point-client.ts | 24 ------------ 4 files changed, 72 insertions(+), 29 deletions(-) diff --git a/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts b/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts index 5ddce23f0..11cf307a3 100644 --- a/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts +++ b/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts @@ -302,6 +302,45 @@ export class AiSdkAgent { tools, stopWhen: [stepCountIs(AGENT_LIMITS.MAX_TURNS)], prepareStep, + onFinish: (event) => { + const previousStep = event.steps.at(-2) + const totalToolCalls = event.steps.reduce( + (sum, step) => sum + step.toolCalls.length, + 0, + ) + const totalToolResults = event.steps.reduce( + (sum, step) => sum + step.toolResults.length, + 0, + ) + + logger.info('Agent tool loop finished', { + conversationId: config.resolvedConfig.conversationId, + provider: config.resolvedConfig.provider, + model: config.resolvedConfig.model, + 
finishReason: event.finishReason, + rawFinishReason: event.rawFinishReason, + stepNumber: event.stepNumber, + stepCount: event.steps.length, + finalTextLength: event.text.length, + emptyFinalText: event.text.trim().length === 0, + lastStepToolCallCount: event.toolCalls.length, + lastStepToolResultCount: event.toolResults.length, + previousStepFinishReason: previousStep?.finishReason, + previousStepToolCallCount: previousStep?.toolCalls.length, + previousStepToolResultCount: previousStep?.toolResults.length, + endedAfterToolResult: + event.toolCalls.length === 0 && + (previousStep?.toolResults.length ?? 0) > 0, + totalToolCalls, + totalToolResults, + totalInputTokens: event.totalUsage.inputTokens, + totalOutputTokens: event.totalUsage.outputTokens, + totalTokens: event.totalUsage.totalTokens, + finalStepInputTokens: event.usage.inputTokens, + finalStepOutputTokens: event.usage.outputTokens, + finalStepTotalTokens: event.usage.totalTokens, + }) + }, ...(isChatGPTPro && { providerOptions: { openai: { diff --git a/packages/browseros-agent/apps/server/src/api/services/chat-service.ts b/packages/browseros-agent/apps/server/src/api/services/chat-service.ts index 7cba8ea91..c466bccb2 100644 --- a/packages/browseros-agent/apps/server/src/api/services/chat-service.ts +++ b/packages/browseros-agent/apps/server/src/api/services/chat-service.ts @@ -4,7 +4,11 @@ * SPDX-License-Identifier: AGPL-3.0-or-later */ -import { createAgentUIStreamResponse, type UIMessage } from 'ai' +import { + createAgentUIStreamResponse, + type FinishReason, + type UIMessage, +} from 'ai' import { AiSdkAgent } from '../../agent/ai-sdk-agent' import { formatUserMessage } from '../../agent/format-message' import { @@ -272,6 +276,9 @@ export class ChatService { // Handle tool approval responses: patch the agent's messages and re-run if (request.toolApprovalResponses?.length) { + let finishReason: FinishReason | undefined + let rawFinishReason: string | undefined + let stepNumber: number | undefined 
this.applyToolApprovalResponses( session.agent.messages, request.toolApprovalResponses, @@ -284,8 +291,21 @@ export class ChatService { agent: session.agent.toolLoopAgent, uiMessages: filterValidMessages(session.agent.messages), abortSignal, + onStepFinish: (step) => { + finishReason = step.finishReason + rawFinishReason = step.rawFinishReason + stepNumber = step.stepNumber + }, onFinish: async ({ messages }: { messages: UIMessage[] }) => { session.agent.messages = filterValidMessages(messages) + logger.info('Agent execution complete', { + conversationId: request.conversationId, + totalMessages: messages.length, + finishReason, + rawFinishReason, + stepNumber, + isToolApprovalResponse: true, + }) }, }) } @@ -333,10 +353,19 @@ export class ChatService { : msg, ) + let finishReason: FinishReason | undefined + let rawFinishReason: string | undefined + let stepNumber: number | undefined + return createAgentUIStreamResponse({ agent: session.agent.toolLoopAgent, uiMessages: promptUiMessages, abortSignal, + onStepFinish: (step) => { + finishReason = step.finishReason + rawFinishReason = step.rawFinishReason + stepNumber = step.stepNumber + }, onFinish: async ({ messages }: { messages: UIMessage[] }) => { // The agent loop returns `messages` containing the prompt- // wrapped user text. 
Restore the raw form before persisting @@ -354,6 +383,9 @@ export class ChatService { logger.info('Agent execution complete', { conversationId: request.conversationId, totalMessages: restored.length, + finishReason, + rawFinishReason, + stepNumber, }) if (session?.hiddenPageId) { diff --git a/packages/browseros-agent/apps/server/src/tools/input.ts b/packages/browseros-agent/apps/server/src/tools/input.ts index ad9160ff0..bc98b7708 100644 --- a/packages/browseros-agent/apps/server/src/tools/input.ts +++ b/packages/browseros-agent/apps/server/src/tools/input.ts @@ -83,12 +83,10 @@ export const click = defineInputTool({ return } - logger.info('GUI click dispatching', clickLog) await ctx.browser.clickAt(args.page, x, y, { button: args.button, clickCount: args.clickCount, }) - logger.info('GUI click dispatched', clickLog) response.text('tool call executed successfully') response.data({ action: 'click', @@ -225,9 +223,7 @@ export const hover = defineInputTool({ return } - logger.info('GUI hover dispatching', hoverLog) await ctx.browser.hoverAt(args.page, x, y) - logger.info('GUI hover dispatched', hoverLog) response.text('tool call executed successfully') response.data({ action: 'hover', diff --git a/packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts b/packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts index 0ea8f5191..90ee238f8 100644 --- a/packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts +++ b/packages/browseros-agent/apps/server/src/tools/molmo-point-client.ts @@ -67,13 +67,6 @@ export async function requestMolmoPoint(args: { const instructionTruncated = instructionLength > MOLMO_POINT_INSTRUCTION_LOG_MAX_CHARS - logger.info('Molmo point request started', { - endpoint, - instruction, - instructionLength, - instructionTruncated, - }) - const response = await fetch(endpoint, { method: 'POST', headers: { 'content-type': 'application/json' }, @@ -130,23 +123,6 @@ export async function requestMolmoPoint(args: { } 
const point = firstValidPoint(payload.points) - logger.info('Molmo point response received', { - endpoint, - instruction, - instructionLength, - instructionTruncated, - status: response.status, - statusText: response.statusText, - rawResponseText: truncateText( - rawResponseText, - MOLMO_POINT_RESPONSE_LOG_MAX_CHARS, - ), - rawResponseTextLength: rawResponseText.length, - rawResponseTextTruncated: - rawResponseText.length > MOLMO_POINT_RESPONSE_LOG_MAX_CHARS, - parsedPoint: point, - }) - if (!point) { throw new Error('Molmo point response did not include a valid point') } From a62f9771a22eb0ecdc0c2f74da0d26810ec15536 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 13:33:43 +0100 Subject: [PATCH 08/13] feat: return GUI click hit element feedback --- .../apps/server/src/agent/prompt.ts | 8 ++-- .../server/src/tools/gui-click-resolver.ts | 37 +++++++++++---- .../apps/server/src/tools/input.ts | 45 +++++++++++++++++-- .../apps/server/tests/tools/input.test.ts | 13 +++--- 4 files changed, 82 insertions(+), 21 deletions(-) diff --git a/packages/browseros-agent/apps/server/src/agent/prompt.ts b/packages/browseros-agent/apps/server/src/agent/prompt.ts index c3fcf48e2..82c85a71d 100644 --- a/packages/browseros-agent/apps/server/src/agent/prompt.ts +++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts @@ -138,7 +138,7 @@ function getCapabilities( ### Browser Control Use these browser tools under the GUI click model constraint: -- \`click\` captures the current page screenshot internally, asks the GUI click model where to click based on your prompt, then executes that coordinate click. +- \`click\` captures the current page screenshot internally, asks the GUI click model where to click based on your prompt, then executes that coordinate click. Make sure to be brief, concise and capture the semantic essence of where you want to click. 
- \`hover\` captures the current page screenshot internally, asks the GUI model where to hover based on your prompt, then moves the cursor there. - \`type_text\` types into the currently focused element. Use it after \`click\` focuses a text field. - \`scroll\` scrolls the page viewport. @@ -226,7 +226,8 @@ function getExecution( - Use \`click\` for visible page targets. It is the only click path that should choose page coordinates. - Use \`hover\` for visible hover targets, \`type_text\` after focusing a field, and \`scroll\` to move the viewport. -- Use \`take_screenshot\` when you need explicit visual feedback about the current page before choosing the next action. Do not call it after every action by default. +- Use \`take_screenshot\` when you need explicit visual feedback about the current page before choosing the next action. +- After each \`click\` or \`hover\`, inspect the returned \`hitElement\` before choosing the next action. If it is null or does not match the intended target, use \`take_screenshot\` or retry with a more specific visual prompt. - Use \`new_page\` or \`navigate_page\` to open websites. Use \`get_active_page\`, \`list_pages\`, and \`close_page\` only when needed for page management. - Use the Page ID from Browser Context directly. - Do not try to observe the page with snapshots, DOM, accessibility trees, scripts, link extraction, or text extraction. @@ -331,7 +332,7 @@ function getToolSelection( - Use \`hover\` for hover menus or targets that reveal content. - Use \`type_text\` only after a prior GUI click likely focused a text input. Include a newline in \`text\` when you need to submit with Enter. - Use \`scroll\` to move the page viewport when the target is likely below or above the visible area. -- Use \`take_screenshot\` sparingly when you need visual feedback before deciding what to click, type, hover, or scroll next. 
+- Use \`take_screenshot\` when you need visual feedback before deciding what to click, type, hover, or scroll next. - The \`prompt\` argument should describe the visible target to click, for example: "click the search box", "click the Add to Cart button", or "click the first product result". - Use page-opening and page-management tools only to get to the website or manage tabs; they do not replace visual page clicking. - Do not request or rely on element IDs, snapshots, DOM nodes, page text, scripts, link extraction, or coordinate click tools. @@ -676,7 +677,6 @@ function getStyle( - Keep click prompts concise and visual. - Do not narrate routine clicks before calling the tool. -- After a click, continue from your best estimate of the page state. - Be concise. diff --git a/packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts b/packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts index 53f22c59c..addfa8948 100644 --- a/packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts +++ b/packages/browseros-agent/apps/server/src/tools/gui-click-resolver.ts @@ -15,9 +15,34 @@ function truncateForLog(value: string | undefined): string | undefined { export interface GuiPointResult { x: number y: number + hitElement: GuiHitElement | null log: Record } +export interface GuiHitElement { + tagName: string + role?: string + ariaLabel?: string + labelText?: string + textContent?: string +} + +function summarizeHitElement( + hitElement: Awaited< + ReturnType + >, +): GuiHitElement | null { + if (!hitElement) return null + + return { + tagName: hitElement.tagName, + role: hitElement.role, + ariaLabel: truncateForLog(hitElement.ariaLabel), + labelText: truncateForLog(hitElement.labelText), + textContent: truncateForLog(hitElement.textContent), + } +} + export async function resolveGuiPoint( ctx: ToolContext, page: number, @@ -54,10 +79,12 @@ export async function resolveGuiPoint( .resolveElementProperties(page, hitElementId) .catch(() => 
null) : null + const hitElementSummary = summarizeHitElement(hitElement) return { x, y, + hitElement: hitElementSummary, log: { page, pageUrl: truncateForLog(pageInfo?.url), @@ -75,15 +102,7 @@ export async function resolveGuiPoint( }, viewport, hitElementId, - hitElement: hitElement - ? { - tagName: hitElement.tagName, - role: hitElement.role, - ariaLabel: truncateForLog(hitElement.ariaLabel), - labelText: truncateForLog(hitElement.labelText), - textContent: truncateForLog(hitElement.textContent), - } - : null, + hitElement: hitElementSummary, }, } } diff --git a/packages/browseros-agent/apps/server/src/tools/input.ts b/packages/browseros-agent/apps/server/src/tools/input.ts index bc98b7708..ad945a490 100644 --- a/packages/browseros-agent/apps/server/src/tools/input.ts +++ b/packages/browseros-agent/apps/server/src/tools/input.ts @@ -1,7 +1,7 @@ import { z } from 'zod' import { logger } from '../lib/logger' import { defineToolWithCategory, type ToolContext } from './framework' -import { resolveGuiPoint } from './gui-click-resolver' +import { type GuiHitElement, resolveGuiPoint } from './gui-click-resolver' import type { ToolResponse } from './response' const pageParam = z.number().describe('Page ID (from list_pages)') @@ -9,6 +9,39 @@ const defineInputTool = defineToolWithCategory('input') const elementParam = z .number() .describe('Element ID from snapshot (the number in [N])') +const guiHitElementOutput = z + .object({ + tagName: z.string(), + role: z.string().optional(), + ariaLabel: z.string().optional(), + labelText: z.string().optional(), + textContent: z.string().optional(), + }) + .nullable() + +function quoteForAgent(value: string): string { + return JSON.stringify(value) +} + +function formatHitElementForAgent(hitElement: GuiHitElement | null): string { + if (!hitElement) { + return 'The click was successful, but no hit element could be resolved at the click point.' 
+ } + + const details = [`tagName=${quoteForAgent(hitElement.tagName)}`] + if (hitElement.role) details.push(`role=${quoteForAgent(hitElement.role)}`) + if (hitElement.ariaLabel) { + details.push(`ariaLabel=${quoteForAgent(hitElement.ariaLabel)}`) + } + if (hitElement.labelText) { + details.push(`labelText=${quoteForAgent(hitElement.labelText)}`) + } + if (hitElement.textContent) { + details.push(`textContent=${quoteForAgent(hitElement.textContent)}`) + } + + return `The click was successful and hit the element: ${details.join(', ')}.` +} async function enforceAcl( toolName: string, @@ -67,9 +100,14 @@ export const click = defineInputTool({ clickCount: z.number(), x: z.number(), y: z.number(), + hitElement: guiHitElementOutput, }), handler: async (args, ctx, response) => { - const { x, y, log } = await resolveGuiPoint(ctx, args.page, args.prompt) + const { x, y, hitElement, log } = await resolveGuiPoint( + ctx, + args.page, + args.prompt, + ) const clickLog = { ...log, clickPoint: { x, y }, @@ -87,7 +125,7 @@ export const click = defineInputTool({ button: args.button, clickCount: args.clickCount, }) - response.text('tool call executed successfully') + response.text(formatHitElementForAgent(hitElement)) response.data({ action: 'click', page: args.page, @@ -96,6 +134,7 @@ export const click = defineInputTool({ clickCount: args.clickCount, x, y, + hitElement, }) }, }) diff --git a/packages/browseros-agent/apps/server/tests/tools/input.test.ts b/packages/browseros-agent/apps/server/tests/tools/input.test.ts index 8dd1a30e1..ea1671635 100644 --- a/packages/browseros-agent/apps/server/tests/tools/input.test.ts +++ b/packages/browseros-agent/apps/server/tests/tools/input.test.ts @@ -256,15 +256,18 @@ describe('input tools', () => { prompt: 'click the Submit button', }) assert.ok(!clickResult.isError, textOf(clickResult)) - assert.strictEqual( + assert.match( textOf(clickResult), - 'tool call executed successfully', - ) - const clickData = structuredOf<{ action: string; 
prompt: string }>( - clickResult, + /The click was successful and hit the element: .*tagName="button".*textContent="Submit"/, ) + const clickData = structuredOf<{ + action: string + prompt: string + hitElement: { tagName: string; textContent?: string } | null + }>(clickResult) assert.strictEqual(clickData.action, 'click') assert.strictEqual(clickData.prompt, 'click the Submit button') + assert.strictEqual(clickData.hitElement?.tagName, 'button') }) const output = await execute(evaluate_script, { From 0eddab8499418e01485307c8976609f85ad52ad1 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 13:56:27 +0100 Subject: [PATCH 09/13] fix: tune opus eval provider configs --- .../browseros-agent-opus-4-6-agisdk-real.json | 27 +++++++++++++++++++ ...agent-opus-4-7-openrouter-agisdk-real.json | 3 ++- .../eval/tests/suites/config-adapter.test.ts | 26 ++++++++++++++---- .../apps/server/src/agent/provider-factory.ts | 16 ++++++----- .../apps/server/src/agent/types.ts | 1 + .../server/src/lib/clients/llm/provider.ts | 16 ++++++++++- .../packages/shared/src/schemas/llm.ts | 2 ++ 7 files changed, 77 insertions(+), 14 deletions(-) create mode 100644 packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json diff --git a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json new file mode 100644 index 000000000..b2e8aaea7 --- /dev/null +++ b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json @@ -0,0 +1,27 @@ +{ + "agent": { + "type": "single", + "provider": "bedrock", + "model": "global.anthropic.claude-opus-4-6-v1", + "region": "us-west-2", + "accessKeyId": "AWS_ACCESS_KEY_ID", + "secretAccessKey": "AWS_SECRET_ACCESS_KEY", + "supportsImages": true + }, + "dataset": "../../data/agisdk-real.jsonl", + "num_workers": 2, + "restart_server_per_task": true, + "browseros": { 
+ "server_url": "http://127.0.0.1:9110", + "base_cdp_port": 9010, + "base_server_port": 9110, + "base_extension_port": 9310, + "load_extensions": false, + "headless": false + }, + "captcha": { + "api_key_env": "NOPECHA_API_KEY" + }, + "graders": ["agisdk_state_diff"], + "timeout_ms": 1800000 +} diff --git a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json index 3970af46c..90e0206e3 100644 --- a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json +++ b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json @@ -7,7 +7,8 @@ "supportsImages": true, "reasoning": { "enabled": true - } + }, + "verbosity": "xhigh" }, "dataset": "../../data/agisdk-real.jsonl", "num_workers": 10, diff --git a/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts b/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts index 6f0de5dc5..55a67126e 100644 --- a/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts +++ b/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts @@ -42,9 +42,12 @@ describe('adaptEvalConfigFile', () => { const kimi = await adaptEvalConfigFile( 'apps/eval/configs/legacy/browseros-agent-kimi-k2-5-agisdk-real.json', ) - const opus = await adaptEvalConfigFile( + const bedrockOpus = await adaptEvalConfigFile( 'apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json', ) + const openrouterOpus = await adaptEvalConfigFile( + 'apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json', + ) expect(kimi.suite.id).toBe('browseros-agent-kimi-k2-5-agisdk-real') expect(kimi.evalConfig.agent).toMatchObject({ @@ -54,16 +57,29 @@ describe('adaptEvalConfigFile', () => { }) expect(kimi.evalConfig.num_workers).toBe(3) - 
expect(opus.suite.id).toBe('browseros-agent-opus-4-6-agisdk-real') - expect(opus.evalConfig.agent).toMatchObject({ + expect(bedrockOpus.suite.id).toBe('browseros-agent-opus-4-6-agisdk-real') + expect(bedrockOpus.evalConfig.agent).toMatchObject({ type: 'single', provider: 'bedrock', model: 'global.anthropic.claude-opus-4-6-v1', - region: 'AWS_REGION', + region: 'us-west-2', accessKeyId: 'AWS_ACCESS_KEY_ID', secretAccessKey: 'AWS_SECRET_ACCESS_KEY', }) - expect(opus.evalConfig.num_workers).toBe(2) + expect(bedrockOpus.evalConfig.num_workers).toBe(2) + + expect(openrouterOpus.suite.id).toBe( + 'browseros-agent-opus-4-7-openrouter-agisdk-real', + ) + expect(openrouterOpus.evalConfig.agent).toMatchObject({ + type: 'single', + provider: 'openrouter', + model: 'anthropic/claude-opus-4.7', + apiKey: 'OPENROUTER_API_KEY', + reasoning: { enabled: true }, + verbosity: 'xhigh', + }) + expect(openrouterOpus.evalConfig.num_workers).toBe(10) }) it('adapts claude-code configs without provider credentials', async () => { diff --git a/packages/browseros-agent/apps/server/src/agent/provider-factory.ts b/packages/browseros-agent/apps/server/src/agent/provider-factory.ts index a452ade5a..11f9b6701 100644 --- a/packages/browseros-agent/apps/server/src/agent/provider-factory.ts +++ b/packages/browseros-agent/apps/server/src/agent/provider-factory.ts @@ -44,15 +44,16 @@ function createGoogleFactory( return createGoogleGenerativeAI({ apiKey: config.apiKey }) } -function buildOpenRouterReasoning( +function buildOpenRouterExtraBody( config: ResolvedAgentConfig, ): Record { - const r = config.reasoning - if (!r) return {} const body: Record = {} - if (r.enabled !== undefined) body.enabled = r.enabled - if (r.maxTokens !== undefined) body.max_tokens = r.maxTokens - if (r.effort !== undefined) body.effort = r.effort + + if (config.reasoning?.enabled !== undefined) { + body.reasoning = { enabled: config.reasoning.enabled } + } + if (config.verbosity !== undefined) body.verbosity = 
config.verbosity + return body } @@ -60,9 +61,10 @@ function createOpenRouterFactory( config: ResolvedAgentConfig, ): (modelId: string) => unknown { if (!config.apiKey) throw new Error('OpenRouter provider requires apiKey') + const extraBody = buildOpenRouterExtraBody(config) return createOpenRouter({ apiKey: config.apiKey, - extraBody: { reasoning: buildOpenRouterReasoning(config) }, + ...(Object.keys(extraBody).length > 0 ? { extraBody } : {}), fetch: createOpenRouterCompatibleFetch(), }) } diff --git a/packages/browseros-agent/apps/server/src/agent/types.ts b/packages/browseros-agent/apps/server/src/agent/types.ts index f8eb7ed30..b8110792d 100644 --- a/packages/browseros-agent/apps/server/src/agent/types.ts +++ b/packages/browseros-agent/apps/server/src/agent/types.ts @@ -35,6 +35,7 @@ export interface ResolvedAgentConfig { reasoningEffort?: string reasoningSummary?: string reasoning?: { enabled?: boolean; maxTokens?: number; effort?: string } + verbosity?: 'low' | 'medium' | 'high' | 'xhigh' | 'max' contextWindowSize?: number userSystemPrompt?: string workingDir?: string diff --git a/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts b/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts index 8018ae69e..eba946b65 100644 --- a/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts +++ b/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts @@ -44,11 +44,25 @@ function createGoogleModel(config: ResolvedLLMConfig): LanguageModel { return createGoogleGenerativeAI({ apiKey: config.apiKey })(config.model) } +function buildOpenRouterExtraBody( + config: ResolvedLLMConfig, +): Record { + const body: Record = {} + + if (config.reasoning?.enabled !== undefined) { + body.reasoning = { enabled: config.reasoning.enabled } + } + if (config.verbosity !== undefined) body.verbosity = config.verbosity + + return body +} + function createOpenRouterModel(config: ResolvedLLMConfig): LanguageModel { if (!config.apiKey) 
throw new Error('OpenRouter provider requires apiKey') + const extraBody = buildOpenRouterExtraBody(config) return createOpenRouter({ apiKey: config.apiKey, - extraBody: { reasoning: {} }, + ...(Object.keys(extraBody).length > 0 ? { extraBody } : {}), fetch: createOpenRouterCompatibleFetch(), })(config.model) } diff --git a/packages/browseros-agent/packages/shared/src/schemas/llm.ts b/packages/browseros-agent/packages/shared/src/schemas/llm.ts index dade9d62a..ac3a5a602 100644 --- a/packages/browseros-agent/packages/shared/src/schemas/llm.ts +++ b/packages/browseros-agent/packages/shared/src/schemas/llm.ts @@ -93,6 +93,7 @@ export const LLMConfigSchema: z.ZodObject<{ > }> > + verbosity: z.ZodOptional> }> = z.object({ provider: LLMProviderSchema, model: z.string().optional(), @@ -116,6 +117,7 @@ export const LLMConfigSchema: z.ZodObject<{ effort: z.enum(['minimal', 'low', 'medium', 'high', 'xhigh']).optional(), }) .optional(), + verbosity: z.enum(['low', 'medium', 'high', 'xhigh', 'max']).optional(), }) export type LLMConfig = z.infer From 90310626765304d06afcef0b8383c99d96187050 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 17:28:37 +0100 Subject: [PATCH 10/13] fix(eval): route opus via openrouter bedrock --- .../browseros-agent-opus-4-6-agisdk-real.json | 20 ++- ...agent-opus-4-7-openrouter-agisdk-real.json | 29 ----- .../eval/tests/suites/config-adapter.test.ts | 32 ++--- .../server/src/agent/message-normalization.ts | 96 +++++++++++++- .../apps/server/src/agent/provider-factory.ts | 19 +++ .../apps/server/src/agent/types.ts | 6 +- .../server/src/lib/clients/llm/provider.ts | 19 +++ .../apps/server/src/lib/openrouter-fetch.ts | 120 ++++++++++++++++-- .../packages/shared/src/schemas/llm.ts | 14 ++ 9 files changed, 282 insertions(+), 73 deletions(-) delete mode 100644 packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json diff --git 
a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json index b2e8aaea7..e86c9fd7c 100644 --- a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json +++ b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json @@ -1,15 +1,21 @@ { "agent": { "type": "single", - "provider": "bedrock", - "model": "global.anthropic.claude-opus-4-6-v1", - "region": "us-west-2", - "accessKeyId": "AWS_ACCESS_KEY_ID", - "secretAccessKey": "AWS_SECRET_ACCESS_KEY", - "supportsImages": true + "provider": "openrouter", + "model": "anthropic/claude-opus-4.6", + "apiKey": "OPENROUTER_API_KEY", + "supportsImages": true, + "reasoning": { + "enabled": true + }, + "verbosity": "high", + "providerRouting": { + "only": ["amazon-bedrock"], + "allowFallbacks": false + } }, "dataset": "../../data/agisdk-real.jsonl", - "num_workers": 2, + "num_workers": 10, "restart_server_per_task": true, "browseros": { "server_url": "http://127.0.0.1:9110", diff --git a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json b/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json deleted file mode 100644 index 90e0206e3..000000000 --- a/packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "agent": { - "type": "single", - "provider": "openrouter", - "model": "anthropic/claude-opus-4.7", - "apiKey": "OPENROUTER_API_KEY", - "supportsImages": true, - "reasoning": { - "enabled": true - }, - "verbosity": "xhigh" - }, - "dataset": "../../data/agisdk-real.jsonl", - "num_workers": 10, - "restart_server_per_task": true, - "browseros": { - "server_url": "http://127.0.0.1:9110", - "base_cdp_port": 9010, - "base_server_port": 9110, - 
"base_extension_port": 9310, - "load_extensions": false, - "headless": false - }, - "captcha": { - "api_key_env": "NOPECHA_API_KEY" - }, - "graders": ["agisdk_state_diff"], - "timeout_ms": 1800000 -} diff --git a/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts b/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts index 55a67126e..a09ec9268 100644 --- a/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts +++ b/packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts @@ -42,12 +42,9 @@ describe('adaptEvalConfigFile', () => { const kimi = await adaptEvalConfigFile( 'apps/eval/configs/legacy/browseros-agent-kimi-k2-5-agisdk-real.json', ) - const bedrockOpus = await adaptEvalConfigFile( + const openrouterBedrockOpus = await adaptEvalConfigFile( 'apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json', ) - const openrouterOpus = await adaptEvalConfigFile( - 'apps/eval/configs/legacy/browseros-agent-opus-4-7-openrouter-agisdk-real.json', - ) expect(kimi.suite.id).toBe('browseros-agent-kimi-k2-5-agisdk-real') expect(kimi.evalConfig.agent).toMatchObject({ @@ -57,29 +54,22 @@ describe('adaptEvalConfigFile', () => { }) expect(kimi.evalConfig.num_workers).toBe(3) - expect(bedrockOpus.suite.id).toBe('browseros-agent-opus-4-6-agisdk-real') - expect(bedrockOpus.evalConfig.agent).toMatchObject({ - type: 'single', - provider: 'bedrock', - model: 'global.anthropic.claude-opus-4-6-v1', - region: 'us-west-2', - accessKeyId: 'AWS_ACCESS_KEY_ID', - secretAccessKey: 'AWS_SECRET_ACCESS_KEY', - }) - expect(bedrockOpus.evalConfig.num_workers).toBe(2) - - expect(openrouterOpus.suite.id).toBe( - 'browseros-agent-opus-4-7-openrouter-agisdk-real', + expect(openrouterBedrockOpus.suite.id).toBe( + 'browseros-agent-opus-4-6-agisdk-real', ) - expect(openrouterOpus.evalConfig.agent).toMatchObject({ + expect(openrouterBedrockOpus.evalConfig.agent).toMatchObject({ type: 'single', provider: 'openrouter', - model: 
'anthropic/claude-opus-4.7', + model: 'anthropic/claude-opus-4.6', apiKey: 'OPENROUTER_API_KEY', reasoning: { enabled: true }, - verbosity: 'xhigh', + verbosity: 'high', + providerRouting: { + only: ['amazon-bedrock'], + allowFallbacks: false, + }, }) - expect(openrouterOpus.evalConfig.num_workers).toBe(10) + expect(openrouterBedrockOpus.evalConfig.num_workers).toBe(10) }) it('adapts claude-code configs without provider credentials', async () => { diff --git a/packages/browseros-agent/apps/server/src/agent/message-normalization.ts b/packages/browseros-agent/apps/server/src/agent/message-normalization.ts index 1792d5c5d..cc60f73fe 100644 --- a/packages/browseros-agent/apps/server/src/agent/message-normalization.ts +++ b/packages/browseros-agent/apps/server/src/agent/message-normalization.ts @@ -19,6 +19,10 @@ type ToolResultContentPart = Extract< type UserMessagePart = Exclude[number] type UserMediaPart = Extract +const MAX_SCREENSHOTS_IN_MODEL_HISTORY = 3 +const SCREENSHOT_HISTORY_PLACEHOLDER = + '' + export interface MessageNormalizationOptions { supportsImages: boolean supportsMediaInToolResults: boolean @@ -113,6 +117,90 @@ function toolResultContentPartToUserMedia( } } +function isScreenshotToolResult(part: ToolResultPart): boolean { + return ( + part.type === 'tool-result' && + typeof part.toolName === 'string' && + (part.toolName.includes('screenshot') || part.toolName === 'snapshot') + ) +} + +function isImageToolResultContentPart(part: ToolResultContentPart): boolean { + switch (part.type) { + case 'media': + case 'image-data': + case 'file-data': + return part.mediaType.startsWith('image/') + case 'image-url': + case 'image-file-id': + return true + default: + return false + } +} + +function pruneScreenshotHistory(messages: ModelMessage[]): ModelMessage[] { + let remainingScreenshots = MAX_SCREENSHOTS_IN_MODEL_HISTORY + let changed = false + const pruned = [...messages] + + for ( + let messageIndex = messages.length - 1; + messageIndex >= 0; +
messageIndex-- + ) { + const message = messages[messageIndex] + if (message.role !== 'tool') continue + + let messageChanged = false + const content = [...message.content] + + for (let partIndex = content.length - 1; partIndex >= 0; partIndex--) { + const part = content[partIndex] + if ( + part.type !== 'tool-result' || + part.output.type !== 'content' || + !isScreenshotToolResult(part) + ) { + continue + } + + let partChanged = false + const value = [...part.output.value] + + for (let valueIndex = value.length - 1; valueIndex >= 0; valueIndex--) { + if (!isImageToolResultContentPart(value[valueIndex])) continue + + if (remainingScreenshots > 0) { + remainingScreenshots-- + continue + } + + value[valueIndex] = { + type: 'text', + text: SCREENSHOT_HISTORY_PLACEHOLDER, + } + partChanged = true + } + + if (!partChanged) continue + + content[partIndex] = { + ...part, + output: { ...part.output, value }, + } + messageChanged = true + } + + if (!messageChanged) continue + + pruned[messageIndex] = { ...message, content } + changed = true + } + + return changed ? pruned : messages +} + function normalizeToolMessageForModel( message: ToolModelMessage, supportsImages: boolean, @@ -178,14 +266,16 @@ export function normalizeMessagesForModel( messages: ModelMessage[], options: MessageNormalizationOptions, ): ModelMessage[] { + const screenshotPrunedMessages = pruneScreenshotHistory(messages) + if (options.supportsMediaInToolResults) { - return messages + return screenshotPrunedMessages } let changed = false const normalized: ModelMessage[] = [] - for (const message of messages) { + for (const message of screenshotPrunedMessages) { if (message.role !== 'tool') { normalized.push(message) continue @@ -201,5 +291,5 @@ export function normalizeMessagesForModel( normalized.push(...replacement) } - return changed ? normalized : messages + return changed ? 
normalized : screenshotPrunedMessages } diff --git a/packages/browseros-agent/apps/server/src/agent/provider-factory.ts b/packages/browseros-agent/apps/server/src/agent/provider-factory.ts index 11f9b6701..d96fec501 100644 --- a/packages/browseros-agent/apps/server/src/agent/provider-factory.ts +++ b/packages/browseros-agent/apps/server/src/agent/provider-factory.ts @@ -53,6 +53,25 @@ function buildOpenRouterExtraBody( body.reasoning = { enabled: config.reasoning.enabled } } if (config.verbosity !== undefined) body.verbosity = config.verbosity + if (config.providerRouting !== undefined) { + body.provider = { + ...(config.providerRouting.order !== undefined && { + order: config.providerRouting.order, + }), + ...(config.providerRouting.only !== undefined && { + only: config.providerRouting.only, + }), + ...(config.providerRouting.ignore !== undefined && { + ignore: config.providerRouting.ignore, + }), + ...(config.providerRouting.allowFallbacks !== undefined && { + allow_fallbacks: config.providerRouting.allowFallbacks, + }), + ...(config.providerRouting.requireParameters !== undefined && { + require_parameters: config.providerRouting.requireParameters, + }), + } + } return body } diff --git a/packages/browseros-agent/apps/server/src/agent/types.ts b/packages/browseros-agent/apps/server/src/agent/types.ts index b8110792d..fde310717 100644 --- a/packages/browseros-agent/apps/server/src/agent/types.ts +++ b/packages/browseros-agent/apps/server/src/agent/types.ts @@ -4,7 +4,10 @@ * SPDX-License-Identifier: AGPL-3.0-or-later */ import type { ToolApprovalConfig } from '@browseros/shared/constants/tool-approval' -import type { LLMProvider } from '@browseros/shared/schemas/llm' +import type { + LLMProvider, + OpenRouterProviderRouting, +} from '@browseros/shared/schemas/llm' export interface ProviderConfig { provider: LLMProvider @@ -36,6 +39,7 @@ export interface ResolvedAgentConfig { reasoningSummary?: string reasoning?: { enabled?: boolean; maxTokens?: number; effort?: 
string } verbosity?: 'low' | 'medium' | 'high' | 'xhigh' | 'max' + providerRouting?: OpenRouterProviderRouting contextWindowSize?: number userSystemPrompt?: string workingDir?: string diff --git a/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts b/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts index eba946b65..dbe66859f 100644 --- a/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts +++ b/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts @@ -53,6 +53,25 @@ function buildOpenRouterExtraBody( body.reasoning = { enabled: config.reasoning.enabled } } if (config.verbosity !== undefined) body.verbosity = config.verbosity + if (config.providerRouting !== undefined) { + body.provider = { + ...(config.providerRouting.order !== undefined && { + order: config.providerRouting.order, + }), + ...(config.providerRouting.only !== undefined && { + only: config.providerRouting.only, + }), + ...(config.providerRouting.ignore !== undefined && { + ignore: config.providerRouting.ignore, + }), + ...(config.providerRouting.allowFallbacks !== undefined && { + allow_fallbacks: config.providerRouting.allowFallbacks, + }), + ...(config.providerRouting.requireParameters !== undefined && { + require_parameters: config.providerRouting.requireParameters, + }), + } + } return body } diff --git a/packages/browseros-agent/apps/server/src/lib/openrouter-fetch.ts b/packages/browseros-agent/apps/server/src/lib/openrouter-fetch.ts index d9ada38f3..ab28fd371 100644 --- a/packages/browseros-agent/apps/server/src/lib/openrouter-fetch.ts +++ b/packages/browseros-agent/apps/server/src/lib/openrouter-fetch.ts @@ -1,5 +1,80 @@ import { APICallError } from '@ai-sdk/provider' +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value) +} + +function getOpenRouterErrorMessage(parsed: unknown, fallback: string): string { + if (!isRecord(parsed) || 
!isRecord(parsed.error)) return fallback + + let message = + typeof parsed.error.message === 'string' ? parsed.error.message : fallback + + if (parsed.error.code !== undefined) { + message = `[${String(parsed.error.code)}] ${message}` + } + + const metadata = isRecord(parsed.error.metadata) + ? parsed.error.metadata + : undefined + if (metadata?.raw !== undefined) { + message += ` (${JSON.stringify(metadata.raw)})` + } + + return message +} + +function getOpenRouterErrorStatus(parsed: unknown, fallback: number): number { + if (!isRecord(parsed) || !isRecord(parsed.error)) return fallback + return typeof parsed.error.code === 'number' ? parsed.error.code : fallback +} + +function sanitizeReasoningReplay( + options?: RequestInit, +): RequestInit | undefined { + if (typeof options?.body !== 'string') return options + + let body: unknown + try { + body = JSON.parse(options.body) + } catch { + return options + } + + if (!isRecord(body) || !Array.isArray(body.messages)) return options + + let changed = false + const messages = body.messages.map((message) => { + if ( + !isRecord(message) || + message.role !== 'assistant' || + !Array.isArray(message.reasoning_details) || + message.reasoning_details.length === 0 || + (!('reasoning' in message) && !('reasoning_content' in message)) + ) { + return message + } + + const { + reasoning: _reasoning, + reasoning_content: _reasoningContent, + ...rest + } = message + changed = true + return rest + }) + + if (!changed) return options + + return { + ...options, + body: JSON.stringify({ + ...body, + messages, + }), + } +} + /** * Creates a fetch function that extracts detailed error messages from OpenRouter-style APIs. 
* @@ -13,25 +88,24 @@ import { APICallError } from '@ai-sdk/provider' */ export function createOpenRouterCompatibleFetch(): typeof fetch { return (async (url: RequestInfo | URL, options?: RequestInit) => { - const response = await globalThis.fetch(url, options) + const response = await globalThis.fetch( + url, + sanitizeReasoningReplay(options), + ) + let responseBody: string | undefined + let parsedResponseBody: unknown if (!response.ok) { const statusCode = response.status let errorMessage = `HTTP ${statusCode}: ${response.statusText}` - let responseBody: string | undefined try { responseBody = await response.clone().text() - const parsed = JSON.parse(responseBody) - if (parsed.error?.message) { - errorMessage = parsed.error.message - if (parsed.error.code) { - errorMessage = `[${parsed.error.code}] ${errorMessage}` - } - if (parsed.error.metadata?.raw) { - errorMessage += ` (${JSON.stringify(parsed.error.metadata.raw)})` - } - } + parsedResponseBody = JSON.parse(responseBody) + errorMessage = getOpenRouterErrorMessage( + parsedResponseBody, + errorMessage, + ) } catch { // Keep default error message if parsing fails } @@ -45,6 +119,28 @@ export function createOpenRouterCompatibleFetch(): typeof fetch { }) } + if (response.headers.get('content-type')?.includes('application/json')) { + try { + responseBody = await response.clone().text() + parsedResponseBody = JSON.parse(responseBody) + } catch { + parsedResponseBody = undefined + } + + if (isRecord(parsedResponseBody) && isRecord(parsedResponseBody.error)) { + throw new APICallError({ + message: getOpenRouterErrorMessage( + parsedResponseBody, + 'Provider returned error', + ), + url: typeof url === 'string' ? 
url : url.toString(), + requestBodyValues: {}, + statusCode: getOpenRouterErrorStatus(parsedResponseBody, 400), + responseBody, + }) + } + } + return response }) as typeof fetch } diff --git a/packages/browseros-agent/packages/shared/src/schemas/llm.ts b/packages/browseros-agent/packages/shared/src/schemas/llm.ts index ac3a5a602..366f01a85 100644 --- a/packages/browseros-agent/packages/shared/src/schemas/llm.ts +++ b/packages/browseros-agent/packages/shared/src/schemas/llm.ts @@ -68,6 +68,18 @@ export const LLMProviderSchema: z.ZodEnum< export type LLMProvider = z.infer +const OpenRouterProviderRoutingSchema = z.object({ + order: z.array(z.string()).optional(), + only: z.array(z.string()).optional(), + ignore: z.array(z.string()).optional(), + allowFallbacks: z.boolean().optional(), + requireParameters: z.boolean().optional(), +}) + +export type OpenRouterProviderRouting = z.infer< + typeof OpenRouterProviderRoutingSchema +> + /** * LLM configuration schema * Used by SDK endpoints and agent configuration @@ -94,6 +106,7 @@ export const LLMConfigSchema: z.ZodObject<{ }> > verbosity: z.ZodOptional> + providerRouting: z.ZodOptional }> = z.object({ provider: LLMProviderSchema, model: z.string().optional(), @@ -118,6 +131,7 @@ export const LLMConfigSchema: z.ZodObject<{ }) .optional(), verbosity: z.enum(['low', 'medium', 'high', 'xhigh', 'max']).optional(), + providerRouting: OpenRouterProviderRoutingSchema.optional(), }) export type LLMConfig = z.infer From 758445fbbef73660a9f19af0fe53dda1d1a60dca Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Wed, 6 May 2026 17:38:39 +0100 Subject: [PATCH 11/13] fix: avoid wheel dispatch for page scroll --- .../apps/server/src/browser/browser.ts | 61 ++----------------- .../apps/server/tests/tools/input.test.ts | 27 ++++++++ 2 files changed, 31 insertions(+), 57 deletions(-) diff --git a/packages/browseros-agent/apps/server/src/browser/browser.ts b/packages/browseros-agent/apps/server/src/browser/browser.ts index 7657a5ca8..99f9a1bea 
100644 --- a/packages/browseros-agent/apps/server/src/browser/browser.ts +++ b/packages/browseros-agent/apps/server/src/browser/browser.ts @@ -1252,69 +1252,16 @@ export class Browser { if (deltaX === 0 && deltaY === 0) return - let x: number - let y: number if (element !== undefined) { const center = await elements.getElementCenter(session, element) - x = center.x - y = center.y - } else { - const metrics = await session.Page.getLayoutMetrics() - const viewport = metrics.cssVisualViewport ?? metrics.cssLayoutViewport - x = viewport.clientWidth / 2 - y = viewport.clientHeight / 2 - } - - const beforeWindowPosition = - element === undefined - ? await this.getWindowScrollPosition(session) - : undefined - - await mouse.dispatchScroll(session, x, y, deltaX, deltaY) - - if (beforeWindowPosition === undefined) return - - const afterWindowPosition = await this.getWindowScrollPosition(session) - const moved = this.didScrollInExpectedDirection( - beforeWindowPosition, - afterWindowPosition, - deltaX, - deltaY, - ) - if (moved) return - - await this.fallbackWindowScroll(session, deltaX, deltaY) - } - - private async getWindowScrollPosition( - session: ProtocolApi, - ): Promise<{ x: number; y: number }> { - const result = await session.Runtime.evaluate({ - expression: - '({ x: window.scrollX ?? window.pageXOffset ?? 0, y: window.scrollY ?? window.pageYOffset ?? 0 })', - returnByValue: true, - }) - const value = (result.result?.value ?? {}) as { x?: number; y?: number } - return { - x: typeof value.x === 'number' ? value.x : 0, - y: typeof value.y === 'number' ? 
value.y : 0, + await mouse.dispatchScroll(session, center.x, center.y, deltaX, deltaY) + return } - } - private didScrollInExpectedDirection( - before: { x: number; y: number }, - after: { x: number; y: number }, - deltaX: number, - deltaY: number, - ): boolean { - if (deltaX > 0 && after.x > before.x) return true - if (deltaX < 0 && after.x < before.x) return true - if (deltaY > 0 && after.y > before.y) return true - if (deltaY < 0 && after.y < before.y) return true - return false + await this.scrollWindow(session, deltaX, deltaY) } - private async fallbackWindowScroll( + private async scrollWindow( session: ProtocolApi, deltaX: number, deltaY: number, diff --git a/packages/browseros-agent/apps/server/tests/tools/input.test.ts b/packages/browseros-agent/apps/server/tests/tools/input.test.ts index ea1671635..69926029f 100644 --- a/packages/browseros-agent/apps/server/tests/tools/input.test.ts +++ b/packages/browseros-agent/apps/server/tests/tools/input.test.ts @@ -644,6 +644,33 @@ describe('input tools', () => { }) }) + it('scroll moves the page viewport', async () => { + await withBrowser(async ({ execute }) => { + const newResult = await execute(new_page, { url: FORM_PAGE }) + const pageId = pageIdOf(newResult) + + const result = await execute(scroll, { + page: pageId, + direction: 'down', + amount: 5, + }) + + assert.ok(!result.isError, textOf(result)) + assert.ok(textOf(result).includes('Scrolled down')) + + const position = await execute(evaluate_script, { + page: pageId, + expression: 'window.scrollY', + }) + const data = structuredOf<{ value?: unknown }>(position) + const value = data.value + assert.strictEqual(typeof value, 'number') + assert.ok(value > 0) + + await execute(close_page, { page: pageId }) + }) + }, 60_000) + it('hover moves cursor via the GUI point model response', async () => { await withBrowser(async ({ browser, execute }) => { const newResult = await execute(new_page, { url: FORM_PAGE }) From 283f76eba7132911f0f58b862078cc4fb144e8e8 Mon 
Sep 17 00:00:00 2001 From: Neel Gupta Date: Thu, 7 May 2026 12:31:49 +0100 Subject: [PATCH 12/13] fix(eval): continue after empty tool-result stop --- .../apps/eval/src/agents/single-agent.ts | 235 +++++++++++------- .../eval/tests/agents/single-agent.test.ts | 50 ++++ 2 files changed, 202 insertions(+), 83 deletions(-) create mode 100644 packages/browseros-agent/apps/eval/tests/agents/single-agent.test.ts diff --git a/packages/browseros-agent/apps/eval/src/agents/single-agent.ts b/packages/browseros-agent/apps/eval/src/agents/single-agent.ts index 3fd55df76..4e378db90 100644 --- a/packages/browseros-agent/apps/eval/src/agents/single-agent.ts +++ b/packages/browseros-agent/apps/eval/src/agents/single-agent.ts @@ -9,7 +9,7 @@ import { CdpBackend } from '@browseros/server/browser/backends/cdp' import { registry } from '@browseros/server/tools/registry' import { CaptchaWaiter } from '../capture/captcha-waiter' import { DEFAULT_TIMEOUT_MS } from '../constants' -import type { TaskMetadata } from '../types' +import type { TaskMetadata, UIMessageStreamEvent } from '../types' import { isProviderExecutionError, retryProviderErrors, @@ -18,6 +18,43 @@ import { resolveProviderConfig } from '../utils/resolve-provider-config' import { withEvalTimeout } from '../utils/with-eval-timeout' import type { AgentContext, AgentEvaluator, AgentResult } from './types' +const EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT = 2 + +interface ToolLoopResultShape { + text: string + finishReason: string + toolCalls: readonly unknown[] + steps: ReadonlyArray<{ + toolResults: readonly unknown[] + }> +} + +export function shouldContinueAfterEmptyToolResultStop( + result: ToolLoopResultShape, +): boolean { + const previousStep = result.steps.at(-2) + + return ( + result.finishReason === 'stop' && + result.text.trim().length === 0 && + result.toolCalls.length === 0 && + (previousStep?.toolResults.length ?? 
0) > 0 + ) +} + +export function buildEmptyToolResultStopContinuationPrompt( + taskQuery: string, +): string { + return [ + 'Continue the eval task from the current browser state.', + '', + 'The previous model response stopped immediately after a tool result without issuing another tool call or a final answer. Do not stop after routine tool results. If the requested workflow is complete, respond with a brief completion message. Otherwise, inspect the page if needed and continue using tools.', + '', + 'Original task:', + taskQuery, + ].join('\n') +} + export class SingleAgentEvaluator implements AgentEvaluator { constructor(private ctx: AgentContext) {} @@ -94,93 +131,125 @@ export class SingleAgentEvaluator implements AgentEvaluator { async (signal) => { if (!agent) throw new Error('Agent was not initialized') const activeAgent = agent - // Format prompt with browser context so the agent knows what page it's on - // (same formatting as chat-service.ts → formatUserMessage) - const prompt = formatUserMessage(task.query, browserContext) - const result = await retryProviderErrors( - () => - activeAgent.toolLoopAgent.generate({ - prompt, - abortSignal: signal, - - experimental_onToolCallStart: ({ toolCall }) => { - const input = toolCall.input as - | Record - | undefined - if (input && typeof input.page === 'number') { - capture.setActivePageId(input.page) - } - }, - - experimental_onToolCallFinish: async () => { - try { - if (captchaWaiter) { - await captchaWaiter.waitIfCaptchaPresent( - browser, + + let continuationCount = 0 + let currentQuery = task.query + + for (;;) { + // Format prompt with browser context so the agent knows what page it's on + // (same formatting as chat-service.ts → formatUserMessage) + const prompt = formatUserMessage(currentQuery, browserContext) + const result = await retryProviderErrors( + () => + activeAgent.toolLoopAgent.generate({ + prompt, + abortSignal: signal, + + experimental_onToolCallStart: ({ toolCall }) => { + const input = 
toolCall.input as + | Record + | undefined + if (input && typeof input.page === 'number') { + capture.setActivePageId(input.page) + } + }, + + experimental_onToolCallFinish: async () => { + try { + if (captchaWaiter) { + await captchaWaiter.waitIfCaptchaPresent( + browser, + capture.getActivePageId(), + ) + } + const screenshotNum = await capture.screenshot.capture( capture.getActivePageId(), ) + capture.emitEvent(task.query_id, { + type: 'screenshot-captured', + screenshot: screenshotNum, + }) + } catch { + // Screenshot failures are non-fatal + } + }, + + onStepFinish: async ({ toolCalls, toolResults, text }) => { + if (toolCalls) { + for (const tc of toolCalls) { + const inputEvent: UIMessageStreamEvent = { + type: 'tool-input-available', + toolCallId: tc.toolCallId, + toolName: tc.toolName, + input: tc.input, + } + await capture.messageLogger.logStreamEvent(inputEvent) + capture.emitEvent(task.query_id, inputEvent) + } } - const screenshotNum = await capture.screenshot.capture( - capture.getActivePageId(), - ) - capture.emitEvent(task.query_id, { - type: 'screenshot-captured', - screenshot: screenshotNum, - }) - } catch { - // Screenshot failures are non-fatal - } - }, - - onStepFinish: async ({ toolCalls, toolResults, text }) => { - if (toolCalls) { - for (const tc of toolCalls) { - const inputEvent = { - type: 'tool-input-available', - toolCallId: tc.toolCallId, - toolName: tc.toolName, - input: tc.input, - } as any - await capture.messageLogger.logStreamEvent(inputEvent) - capture.emitEvent(task.query_id, inputEvent) + + if (toolResults) { + for (const tr of toolResults) { + const outputEvent: UIMessageStreamEvent = { + type: 'tool-output-available', + toolCallId: tr.toolCallId, + output: tr.output, + } + await capture.messageLogger.logStreamEvent(outputEvent) + capture.emitEvent(task.query_id, outputEvent) + } } - } - - if (toolResults) { - for (const tr of toolResults) { - const outputEvent = { - type: 'tool-output-available', - toolCallId: tr.toolCallId, 
- output: tr.output, - } as any - await capture.messageLogger.logStreamEvent(outputEvent) - capture.emitEvent(task.query_id, outputEvent) + + if (text) { + const textId = randomUUID() + const startEvent: UIMessageStreamEvent = { + type: 'text-start', + id: textId, + } + const deltaEvent: UIMessageStreamEvent = { + type: 'text-delta', + id: textId, + delta: text, + } + const endEvent: UIMessageStreamEvent = { + type: 'text-end', + id: textId, + } + await capture.messageLogger.logStreamEvent(startEvent) + await capture.messageLogger.logStreamEvent(deltaEvent) + await capture.messageLogger.logStreamEvent(endEvent) + capture.emitEvent(task.query_id, deltaEvent) } - } - - if (text) { - const textId = randomUUID() - const startEvent = { type: 'text-start', id: textId } as any - const deltaEvent = { - type: 'text-delta', - id: textId, - delta: text, - } as any - const endEvent = { type: 'text-end', id: textId } as any - await capture.messageLogger.logStreamEvent(startEvent) - await capture.messageLogger.logStreamEvent(deltaEvent) - await capture.messageLogger.logStreamEvent(endEvent) - capture.emitEvent(task.query_id, deltaEvent) - } - }, - }), - { - label: `single-agent ${task.query_id}`, - signal, - }, - ) - - finalText = result.text || null + }, + }), + { + label: `single-agent ${task.query_id}`, + signal, + }, + ) + + if (!shouldContinueAfterEmptyToolResultStop(result)) { + finalText = result.text || null + break + } + + if ( + continuationCount >= EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT + ) { + throw new Error( + `Model stopped with empty output immediately after a tool result ${continuationCount + 1} times`, + ) + } + + continuationCount++ + capture.addWarning( + 'agent_execution', + `Model stopped with empty output immediately after a tool result; continuing task (${continuationCount}/${EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT})`, + ) + currentQuery = buildEmptyToolResultStopContinuationPrompt( + task.query, + ) + } }, { rethrowError: isProviderExecutionError }, 
) diff --git a/packages/browseros-agent/apps/eval/tests/agents/single-agent.test.ts b/packages/browseros-agent/apps/eval/tests/agents/single-agent.test.ts new file mode 100644 index 000000000..57eaeee2a --- /dev/null +++ b/packages/browseros-agent/apps/eval/tests/agents/single-agent.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it } from 'bun:test' +import { + buildEmptyToolResultStopContinuationPrompt, + shouldContinueAfterEmptyToolResultStop, +} from '../../src/agents/single-agent' + +describe('single-agent empty tool-result stop handling', () => { + it('continues when the model emits an empty stop after a tool result', () => { + expect( + shouldContinueAfterEmptyToolResultStop({ + text: '', + finishReason: 'stop', + toolCalls: [], + steps: [{ toolResults: [{ ok: true }] }, { toolResults: [] }], + }), + ).toBe(true) + }) + + it('does not continue for normal final text', () => { + expect( + shouldContinueAfterEmptyToolResultStop({ + text: 'Done', + finishReason: 'stop', + toolCalls: [], + steps: [{ toolResults: [{ ok: true }] }, { toolResults: [] }], + }), + ).toBe(false) + }) + + it('does not continue when there was no previous tool result', () => { + expect( + shouldContinueAfterEmptyToolResultStop({ + text: '', + finishReason: 'stop', + toolCalls: [], + steps: [{ toolResults: [] }], + }), + ).toBe(false) + }) + + it('builds a continuation prompt with the original task', () => { + const prompt = buildEmptyToolResultStopContinuationPrompt( + 'Delete the target email.', + ) + + expect(prompt).toContain('Continue the eval task') + expect(prompt).toContain('Do not stop after routine tool results') + expect(prompt).toContain('Delete the target email.') + }) +}) From 0f7e2098c48fc6717a0d3abf6161fbeb97352296 Mon Sep 17 00:00:00 2001 From: Neel Gupta Date: Thu, 7 May 2026 20:44:46 +0100 Subject: [PATCH 13/13] fix(agent): improve gui eval reliability --- .../eval/src/runner/browseros-app-manager.ts | 13 +++++-- .../eval/src/utils/provider-error-retry.ts | 35 
+++++++++++++++-- .../tests/utils/provider-error-retry.test.ts | 38 +++++++++++++++++++ .../apps/server/src/agent/ai-sdk-agent.ts | 1 + .../apps/server/src/agent/prompt.ts | 25 ++++++++++++ .../apps/server/src/agent/tool-adapter.ts | 1 + .../apps/server/src/tools/input.ts | 4 ++ 7 files changed, 110 insertions(+), 7 deletions(-) diff --git a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts index f3f50b0a9..1cabec6ef 100644 --- a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts +++ b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts @@ -62,6 +62,12 @@ const CAPTCHA_EXT_DIR = join( '../../extensions/nopecha', ) +export function resolveServerStartScript( + env: Record = process.env, +): string { + return env.BROWSEROS_EVAL_SERVER_START_SCRIPT || 'start:ci' +} + export class BrowserOSAppManager { private ports: EvalPorts private chromeProc: Subprocess | null = null @@ -215,9 +221,10 @@ export class BrowserOSAppManager { } this.serverLogFd = logFd - // `start:ci` skips `--watch` (no file-watcher overhead in CI). Falls back - // to the regular `start` script outside CI for the dev-watch experience. - const startScript = process.env.CI ? 'start:ci' : 'start' + // Eval servers must not use `start` because it runs Bun in watch mode; a + // source edit during a long eval can restart the worker server before the + // grader fetches /finish. Keep an escape hatch for local debugging. 
+ const startScript = resolveServerStartScript() this.serverProc = spawn({ cmd: ['bun', 'run', '--filter', '@browseros/server', startScript], cwd: MONOREPO_ROOT, diff --git a/packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts b/packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts index 428a76e21..82a24b246 100644 --- a/packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts +++ b/packages/browseros-agent/apps/eval/src/utils/provider-error-retry.ts @@ -2,6 +2,8 @@ import { sleep } from './sleep' const DEFAULT_PROVIDER_ERROR_RETRIES = 5 const DEFAULT_PROVIDER_ERROR_RETRY_WINDOW_MS = 10_000 +const DEFAULT_RATE_LIMIT_RETRIES = 8 +const DEFAULT_RATE_LIMIT_RETRY_WINDOW_MS = 120_000 const PROVIDER_ERROR_LOG_MAX_STRING_CHARS = 10_000 const PROVIDER_ERROR_LOG_MAX_DEPTH = 5 @@ -19,6 +21,8 @@ export interface ProviderErrorRetryOptions { signal?: AbortSignal retries?: number windowMs?: number + rateLimitRetries?: number + rateLimitWindowMs?: number onRetry?: (event: ProviderErrorRetryEvent) => void } @@ -47,7 +51,9 @@ function errorMarkers(error: unknown, seen = new Set()): string[] { const record = error as Record if ('isRetryable' in record) markers.push('isRetryable') if ('statusCode' in record) markers.push('statusCode') + if ('statusCode' in record) markers.push(String(record.statusCode)) if ('responseBody' in record) markers.push('responseBody') + if ('responseBody' in record) markers.push(String(record.responseBody)) if ('cause' in record) { markers.push(...errorMarkers(record.cause, seen)) } @@ -63,6 +69,7 @@ function errorMarkers(error: unknown, seen = new Set()): string[] { export function isProviderExecutionError(error: unknown): boolean { const markerText = errorMarkers(error).join('\n') return ( + isProviderRateLimitError(error) || markerText.includes('Provider returned error') || markerText.includes('APICallError') || markerText.includes('AI_RetryError') || @@ -73,6 +80,17 @@ export function 
isProviderExecutionError(error: unknown): boolean { ) } +export function isProviderRateLimitError(error: unknown): boolean { + const markerText = errorMarkers(error).join('\n').toLowerCase() + return ( + markerText.includes('rate limit') || + markerText.includes('rate-limit') || + markerText.includes('too many requests') || + markerText.includes('statuscode\n429') || + markerText.includes('\n429\n') + ) +} + function errorMessage(error: unknown): string { return error instanceof Error ? error.message : String(error) } @@ -150,9 +168,13 @@ export async function retryProviderErrors( operation: () => Promise, options: ProviderErrorRetryOptions, ): Promise { - const maxRetries = options.retries ?? DEFAULT_PROVIDER_ERROR_RETRIES - const windowMs = options.windowMs ?? DEFAULT_PROVIDER_ERROR_RETRY_WINDOW_MS - const delayMs = maxRetries > 0 ? Math.floor(windowMs / maxRetries) : 0 + const providerRetries = options.retries ?? DEFAULT_PROVIDER_ERROR_RETRIES + const providerWindowMs = + options.windowMs ?? DEFAULT_PROVIDER_ERROR_RETRY_WINDOW_MS + const rateLimitRetries = + options.rateLimitRetries ?? DEFAULT_RATE_LIMIT_RETRIES + const rateLimitWindowMs = + options.rateLimitWindowMs ?? DEFAULT_RATE_LIMIT_RETRY_WINDOW_MS for (let attempt = 0; ; attempt++) { try { @@ -163,6 +185,11 @@ export async function retryProviderErrors( throw error } + const isRateLimit = isProviderRateLimitError(error) + const maxRetries = isRateLimit ? rateLimitRetries : providerRetries + const windowMs = isRateLimit ? rateLimitWindowMs : providerWindowMs + const delayMs = maxRetries > 0 ? 
Math.floor(windowMs / maxRetries) : 0 + if (attempt >= maxRetries) { logFinalProviderError(options.label, error, attempt + 1) throw error @@ -176,7 +203,7 @@ export async function retryProviderErrors( } options.onRetry?.(event) console.warn( - `[provider-retry] ${options.label}: retry ${event.retryNumber}/${maxRetries} in ${delayMs}ms after provider error: ${errorMessage(error)}`, + `[provider-retry] ${options.label}: retry ${event.retryNumber}/${maxRetries} in ${delayMs}ms after ${isRateLimit ? 'rate limit' : 'provider error'}: ${errorMessage(error)}`, ) await sleep(delayMs, options.signal) } diff --git a/packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts b/packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts index 0b1b5f2f5..baa42e7ab 100644 --- a/packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts +++ b/packages/browseros-agent/apps/eval/tests/utils/provider-error-retry.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from 'bun:test' import { isProviderExecutionError, + isProviderRateLimitError, retryProviderErrors, } from '../../src/utils/provider-error-retry' @@ -13,6 +14,13 @@ function providerError(message = 'Provider returned error'): Error { return error } +function rateLimitError(): Error { + const error = new Error('rate limit exceeded, please try again later') + error.name = 'AI_APICallError' + ;(error as unknown as Record).statusCode = 429 + return error +} + async function withoutRetryWarnings(fn: () => Promise): Promise { const originalWarn = console.warn const originalError = console.error @@ -29,6 +37,13 @@ async function withoutRetryWarnings(fn: () => Promise): Promise { describe('provider error retries', () => { it('detects provider errors from SDK-style markers', () => { expect(isProviderExecutionError(providerError())).toBe(true) + expect(isProviderExecutionError(rateLimitError())).toBe(true) + expect(isProviderRateLimitError(rateLimitError())).toBe(true) + expect( + 
isProviderExecutionError( + new Error('rate limit exceeded, please try again later'), + ), + ).toBe(true) expect(isProviderExecutionError(new Error('regular tool failure'))).toBe( false, ) @@ -51,6 +66,29 @@ describe('provider error retries', () => { }) }) + it('uses the rate-limit retry policy for provider rate limits', async () => { + await withoutRetryWarnings(async () => { + let calls = 0 + const result = await retryProviderErrors( + async () => { + calls++ + if (calls <= 2) throw rateLimitError() + return 'ok' + }, + { + label: 'test', + retries: 0, + windowMs: 0, + rateLimitRetries: 2, + rateLimitWindowMs: 0, + }, + ) + + expect(result).toBe('ok') + expect(calls).toBe(3) + }) + }) + it('throws the final provider error after retries are exhausted', async () => { const originalWarn = console.warn const originalError = console.error diff --git a/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts b/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts index 11cf307a3..1437887a4 100644 --- a/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts +++ b/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts @@ -268,6 +268,7 @@ export class AiSdkAgent { skillsCatalog, origin: config.resolvedConfig.origin, guiClickOnly: GUI_CLICK_ONLY_MODE, + evalMode: config.resolvedConfig.evalMode, }) // Configure compaction for context window management diff --git a/packages/browseros-agent/apps/server/src/agent/prompt.ts b/packages/browseros-agent/apps/server/src/agent/prompt.ts index 82c85a71d..f8cf5c837 100644 --- a/packages/browseros-agent/apps/server/src/agent/prompt.ts +++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts @@ -524,6 +524,28 @@ function getErrorRecovery( return recovery } +// ----------------------------------------------------------------------------- +// section: eval-mode +// ----------------------------------------------------------------------------- + +function getEvalMode( + _exclude: Set, + options?: 
BuildSystemPromptOptions, +): string { + if (!options?.evalMode) return '' + + return ` +## Eval Mode + +You are running inside a benchmark environment. The user's task is pre-authorized for the eval clone site. + +- Do not ask the user for confirmation before clicking final submit, place-order, book, send, save, create, or similar completion buttons required by the task. +- If the task asks you to complete a workflow and the visible page asks for fields the user did not specify, choose reasonable dummy values. +- Complete the task end-to-end using the available tools, then report the outcome. +- Only pause for credentials, CAPTCHA, 2FA, real payment confirmation, or blocked site access. +` +} + // ----------------------------------------------------------------------------- // section: memory-and-identity // ----------------------------------------------------------------------------- @@ -809,6 +831,7 @@ const promptSections: Record = { ) => getToolSelection(_exclude, options), 'external-integrations': getExternalIntegrations, 'error-recovery': getErrorRecovery, + 'eval-mode': getEvalMode, 'memory-and-identity': getMemoryAndIdentity, workspace: getWorkspace, skills: (_exclude: Set, options?: BuildSystemPromptOptions) => @@ -837,6 +860,8 @@ export interface BuildSystemPromptOptions { origin?: 'sidepanel' | 'newtab' /** Experimental mode: browser control is limited to GUI-backed click only. */ guiClickOnly?: boolean + /** Eval mode: benchmark tasks are pre-authorized within clone sites. 
*/ + evalMode?: boolean } export function buildSystemPrompt(options?: BuildSystemPromptOptions): string { diff --git a/packages/browseros-agent/apps/server/src/agent/tool-adapter.ts b/packages/browseros-agent/apps/server/src/agent/tool-adapter.ts index 432471fa4..df26c8ab8 100644 --- a/packages/browseros-agent/apps/server/src/agent/tool-adapter.ts +++ b/packages/browseros-agent/apps/server/src/agent/tool-adapter.ts @@ -79,6 +79,7 @@ export function buildBrowserToolSet( content: result.content, isError: result.isError ?? false, metadata: result.metadata, + structuredContent: result.structuredContent, } } catch (error) { const errorText = diff --git a/packages/browseros-agent/apps/server/src/tools/input.ts b/packages/browseros-agent/apps/server/src/tools/input.ts index ad945a490..98306e15e 100644 --- a/packages/browseros-agent/apps/server/src/tools/input.ts +++ b/packages/browseros-agent/apps/server/src/tools/input.ts @@ -101,6 +101,7 @@ export const click = defineInputTool({ x: z.number(), y: z.number(), hitElement: guiHitElementOutput, + guiPointDebug: z.record(z.unknown()).optional(), }), handler: async (args, ctx, response) => { const { x, y, hitElement, log } = await resolveGuiPoint( @@ -135,6 +136,7 @@ export const click = defineInputTool({ x, y, hitElement, + guiPointDebug: clickLog, }) }, }) @@ -251,6 +253,7 @@ export const hover = defineInputTool({ prompt: z.string(), x: z.number(), y: z.number(), + guiPointDebug: z.record(z.unknown()).optional(), }), handler: async (args, ctx, response) => { const { x, y, log } = await resolveGuiPoint(ctx, args.page, args.prompt) @@ -270,6 +273,7 @@ export const hover = defineInputTool({ prompt: args.prompt, x, y, + guiPointDebug: hoverLog, }) }, })