diff --git a/src/@types/types.d.ts b/src/@types/types.d.ts index c375faf..9cdcbf4 100644 --- a/src/@types/types.d.ts +++ b/src/@types/types.d.ts @@ -37,6 +37,13 @@ import type { ProxyOptionsSchema } from '../lib/agent-client.js'; export interface BrowserlessSession extends Record { token: string; apiUrl: string; + /** + * A pre-created browser session id to ATTACH to (via /chromium/agent?sessionId), + * threaded by the caller through the `x-browserless-session-id` header. Used by + * the autologin runner, which does POST /profile itself and hands the agent the + * resulting id instead of letting the model open a `createProfile` session. + */ + attachSessionId?: string; } export interface SupabaseJwtPayload { @@ -134,6 +141,7 @@ export interface SnapshotElement { focused?: boolean; required?: boolean; ariaLabel?: string; + frameId?: string; } export interface TabInfo { @@ -143,6 +151,13 @@ export interface TabInfo { active: boolean; } +// for iframe handling +export interface FrameInfo { + frameId: string; + url: string; + crossOrigin: boolean; +} + export interface SnapshotResult { url: string; title: string; @@ -151,6 +166,7 @@ export interface SnapshotResult { tabs?: TabInfo[]; activeTargetId?: string | null; detectedChallenges?: string[]; + frames?: FrameInfo[]; } export interface ActiveSession { diff --git a/src/index.ts b/src/index.ts index 7748da5..7798a7d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -117,18 +117,31 @@ const hybridAuthenticate = params.get('browserlessUrl') ?? config.browserlessApiUrl; + // A pre-created session id to attach to, threaded by the autologin + // runner. The agent tool opens /chromium/agent?sessionId= instead + // of doing its own POST /profile. + const attachSessionId = + (request.headers['x-browserless-session-id'] as string) ?? + params.get('browserlessSessionId') ?? + undefined; + // JWTs have 3 dot-separated base64url segments; plain API keys do not. const isJwt = headerToken ? headerToken.split('.').length === 3 : false; + // apiUrl/attachSessionId are the same across every auth path; only the + // resolved token differs. + const session = (token: string): BrowserlessSession => + ({ token, apiUrl, attachSessionId }) as BrowserlessSession; + // 1. Authorization header with plain API key if (headerToken && !isJwt) { - return { token: headerToken, apiUrl } as BrowserlessSession; + return session(headerToken); } // 2. ?token= query param const directToken = params.get('token') || undefined; if (directToken) { - return { token: directToken, apiUrl } as BrowserlessSession; + return session(directToken); } // 3. Authorization header with JWT → decode Supabase token directly @@ -138,7 +151,7 @@ const hybridAuthenticate = config.supabaseServiceRoleKey, headerToken, ); - return { token: apiKey, apiUrl } as BrowserlessSession; + return session(apiKey); } throw new Error( diff --git a/src/lib/agent-client.ts b/src/lib/agent-client.ts index f4bd570..7a92948 100644 --- a/src/lib/agent-client.ts +++ b/src/lib/agent-client.ts @@ -237,11 +237,13 @@ const getSessionKey = ( proxy?: ProxyOptions, profile?: string, createProfile?: CreateProfileParams, + attachSessionId?: string, ): string => (mcpSessionId ?? `stdio:${hashToken(token)}`) + proxyFingerprint(proxy) + (profile ? KEY_SEP + 'profile#' + hashToken(profile) : '') + - (createProfile ? KEY_SEP + 'create#' + hashToken(createProfile.name) : ''); + (createProfile ? KEY_SEP + 'create#' + hashToken(createProfile.name) : '') + + (attachSessionId ? KEY_SEP + 'attach#' + attachSessionId : ''); /** * Build the WebSocket URL for `/chromium/agent`: normalize trailing slashes, @@ -601,9 +603,17 @@ export const getOrCreateSession = async ( proxy?: ProxyOptions, profile?: string, createProfile?: CreateProfileParams, + attachSessionId?: string, ): Promise => { sweepSessions(); - const key = getSessionKey(mcpSessionId, token, proxy, profile, createProfile); + const key = getSessionKey( + mcpSessionId, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); const existing = sessions.get(key); if (existing && existing.ws.readyState === WebSocket.OPEN) { @@ -626,11 +636,19 @@ export const getOrCreateSession = async ( } const creation = (async (): Promise => { - // Profile-creation mode: launch a tracked session via POST /profile, then - // attach the agent WS to it by id. Otherwise launch a fresh agent browser. - const creationSessionId = createProfile - ? (await postCreateProfile(apiUrl, token, createProfile)).id - : undefined; + // Three modes for the session to attach to: + // - attachSessionId: a session the caller already created (autologin + // runner did POST /profile itself) — attach by id, no POST here. + // - createProfile: open a tracked session via POST /profile, then attach. + // - neither: launch a fresh agent browser. + let creationSessionId: string | undefined; + if (attachSessionId) { + creationSessionId = attachSessionId; + } else if (createProfile) { + creationSessionId = ( + await postCreateProfile(apiUrl, token, createProfile) + ).id; + } const ws = await connect(apiUrl, token, proxy, profile, creationSessionId); const session: ActiveSession = { ws, @@ -727,8 +745,16 @@ export const closeSession = ( proxy?: ProxyOptions, profile?: string, createProfile?: CreateProfileParams, + attachSessionId?: string, ): void => { - const key = getSessionKey(mcpSessionId, token, proxy, profile, createProfile); + const key = getSessionKey( + mcpSessionId, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); const session = sessions.get(key); if (session) { try { @@ -751,8 +777,16 @@ export const destroySession = ( proxy?: ProxyOptions, profile?: string, createProfile?: CreateProfileParams, + attachSessionId?: string, ): void => { - const key = getSessionKey(mcpSessionId, token, proxy, profile, createProfile); + const key = getSessionKey( + mcpSessionId, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); const session = sessions.get(key); if (session) { try { diff --git a/src/lib/agent-format.ts b/src/lib/agent-format.ts index b1ff2dd..0c99265 100644 --- a/src/lib/agent-format.ts +++ b/src/lib/agent-format.ts @@ -5,6 +5,7 @@ export type { SnapshotResult, SnapshotElement, TabInfo, + FrameInfo, } from '../@types/types.js'; const safeOrigin = (url: string): string | undefined => { @@ -118,10 +119,16 @@ export const formatConnectError = (err: unknown): string => { /** * Format a single snapshot element as a compact one-liner: - * [ref] tag role "name" ref=selector value="…" (state) + * [ref] tag role "name" ref=selector value="…" (state) [frame#N] * e.g. [7] input checkbox "Remember me" ref=input#remember (checked, required) + * `frameLabels` maps a frameId to its display label (frame#1, …); when an + * element carries a frameId, the label is appended so the agent sees which + * iframe it lives in. */ -const formatElement = (el: SnapshotElement): string => { +const formatElement = ( + el: SnapshotElement, + frameLabels?: Map, +): string => { const parts: string[] = [`[${el.ref}]`, el.tag, el.role]; const name = el.name || el.text || ''; if (name) parts.push(`"${name}"`); @@ -141,6 +148,9 @@ const formatElement = (el: SnapshotElement): string => { if (el.required) flags.push('required'); if (flags.length) parts.push(`(${flags.join(', ')})`); + const frameLabel = el.frameId && frameLabels?.get(el.frameId); + if (frameLabel) parts.push(`[${frameLabel}]`); + return parts.join(' '); }; @@ -166,10 +176,29 @@ export const formatSnapshot = (snapshot: SnapshotResult): string => { } } + // Label cross-origin iframes (frame#1, …) and list them so the agent knows + // which elements live in a frame and that their deep-ref selectors pierce it. + const frameLabels = new Map(); + if (snapshot.frames?.length) { + snapshot.frames.forEach((frame, i) => + frameLabels.set(frame.frameId, `frame#${i + 1}`), + ); + lines.push(`Frames (${snapshot.frames.length} iframes):`); + for (const frame of snapshot.frames) { + const origin = frame.crossOrigin ? 'cross-origin' : 'same-origin'; + lines.push( + ` ${frameLabels.get(frame.frameId)} ${frame.url} (${origin})`, + ); + } + lines.push( + 'Elements tagged [frame#N] live in that iframe; their deep-ref selectors pierce it — pass as-is to click/type/hover.', + ); + } + lines.push(''); for (const el of snapshot.elements) { - lines.push(formatElement(el)); + lines.push(formatElement(el, frameLabels)); } lines.push('--- END SNAPSHOT ---'); diff --git a/src/lib/define-tool.ts b/src/lib/define-tool.ts index 46d2dbf..7bd4aff 100644 --- a/src/lib/define-tool.ts +++ b/src/lib/define-tool.ts @@ -45,6 +45,11 @@ export interface ToolRunContext

{ }) => Promise; /** MCP session id (httpStream transport) or undefined for stdio — used by agent tool. */ sessionId: string | undefined; + /** + * Pre-created browser session id to attach to (from the `x-browserless-session-id` + * header). When set, the agent tool attaches to it instead of opening its own. + */ + attachSessionId?: string; } export interface ToolDefinition { @@ -142,6 +147,7 @@ export function defineTool( apiUrl, reportProgress, sessionId, + attachSessionId: s?.attachSessionId, }); } catch (err) { if (err instanceof ProfileNotFoundError) { diff --git a/src/skills/autonomous-login.md b/src/skills/autonomous-login.md index f909560..45f713c 100644 --- a/src/skills/autonomous-login.md +++ b/src/skills/autonomous-login.md @@ -60,6 +60,8 @@ One batched call (type username, type password, click submit) with Gate-2 values 2. Password input absent from new snapshot. 3. Authed element matching `/log out|sign out|my account|profile|dashboard|avatar/i`. +The visible account/display name will usually NOT equal the email or username you typed (it's the profile's display name, often a real name) — that's expected, NOT a mismatch. Never mark a login failed because the shown identity differs from the credential; judge only by the three signals above. + None holds: - Error matching `/invalid|incorrect|wrong|doesn'?t match|not recognized|please try again/i` → `INVALID_CREDENTIALS`. diff --git a/src/skills/shadow-dom.md b/src/skills/shadow-dom.md index a589285..4de70ed 100644 --- a/src/skills/shadow-dom.md +++ b/src/skills/shadow-dom.md @@ -2,6 +2,15 @@ Snapshot contains `deep-ref=` selectors, or you hit `SELECTOR_NOT_FOUND` on regular selector. Page using shadow DOM or iframes — read before next action. +## Iframes in the snapshot + +Iframes (same-origin and cross-origin) are now snapshotted too. When present: + +- Snapshot shows a `Frames (N iframes):` block listing each frame's label, URL, and origin. +- Elements inside a frame are tagged `[frame#N]` and carry a ready `deep-ref=` selector — cross-origin uses `< *url* css`, same-origin uses `< css`. Pass it as-is to `click`/`type`/`hover`/`checkbox` — no frame switching, no hand-construction. + +Only build a deep selector by hand (below) when a frame element wasn't surfaced (a11y-empty widget, capped snapshot). + ## Deep selectors: `< ` prefix Browserless deep selectors start with `< ` (less-than, space). Space mandatory. Format: @@ -20,7 +29,7 @@ When snapshot lists `deep-ref=< button#deny`, pass to `click` / `type` / `hover` ## Constructing deep selectors for iframes snapshot didn't surface -Snapshots only include accessible content. Iframes (captcha/payment widgets) often have nothing meaningful in accessibility tree. Build selector by hand: +Fallback only — most cross-origin iframes are now in the snapshot (see above). Some widgets still have nothing meaningful in the accessibility tree. Build selector by hand: - `< *google.com/recaptcha* #recaptcha-anchor` — reCAPTCHA checkbox - `< *hcaptcha.com* #checkbox` — hCaptcha checkbox diff --git a/src/skills/system-prompt.ts b/src/skills/system-prompt.ts index e79b640..eac67d3 100644 --- a/src/skills/system-prompt.ts +++ b/src/skills/system-prompt.ts @@ -50,7 +50,10 @@ Load manually via **browserless_skill** if suspected but not injected: ## Selectors - Use **ref=** (CSS) or **deep-ref=** (starts \`< \`) exactly as shown in snapshot - Example: \`[3] button "Sign In" ref=button#submit\` → \`"button#submit"\` -- deep-ref for shadow DOM — see \`shadow-dom\` skill +- deep-ref for shadow DOM / iframes — see \`shadow-dom\` skill + +## Iframes +Snapshots include a \`Frames\` list (cross-origin iframes) when present. Elements inside a frame are tagged \`[frame#N]\` and carry a \`deep-ref=< *url* css\` selector that already pierces the frame — pass it as-is to \`click\`/\`type\`/\`hover\`/\`checkbox\`. No frame switching needed. captcha/payment widgets (reCAPTCHA, hCaptcha, Stripe, Turnstile) show up here. \`shadow-dom\` skill auto-loads when frames present. ## Tabs Snapshots include \`tabs\` + \`activeTargetId\` — no getTabs needed. Multi-tab / \`snapshot { targetId }\` in \`tabs\` skill (auto-loads when >1 tab). diff --git a/src/tools/agent.ts b/src/tools/agent.ts index e6876f2..f6d34b9 100644 --- a/src/tools/agent.ts +++ b/src/tools/agent.ts @@ -162,6 +162,7 @@ export function registerAgentTools( token, apiUrl, sessionId: mcpSessionId, + attachSessionId, }) => { const commands: Array<{ method: string; @@ -203,11 +204,44 @@ export function registerAgentTools( } if (commands.length === 1 && commands[0].method === 'close') { - closeSession(mcpSessionId, token, proxy, profile, createProfile); + closeSession( + mcpSessionId, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); sendAnalytics(true); return [{ type: 'text' as const, text: 'Browser session closed.' }]; } + // Open-only call: no real command (e.g. `createProfile`/`profile`/`proxy` + // set with no method/commands). Dispatching the empty-method default would + // make the agent route reject it as `Missing required id/method`, so just + // open (or reuse) the session and report it's ready for follow-up commands. + if (commands.length === 1 && !commands[0].method) { + try { + await getOrCreateSession( + mcpSessionId, + apiUrl, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); + } catch (connErr: unknown) { + sendAnalytics(false); + throw new UserError(formatConnectError(connErr)); + } + sendAnalytics(true); + const text = createProfile + ? `Profile-creation session "${createProfile.name}" is open (non-headless). Send commands to drive the login, then call saveProfile.` + : 'Browser session is open. Send commands to drive it.'; + return [{ type: 'text' as const, text }]; + } + const runCommands = async (isRetry: boolean): Promise => { let agentSession; try { @@ -218,6 +252,7 @@ export function registerAgentTools( proxy, profile, createProfile, + attachSessionId, ); } catch (connErr: unknown) { // No retry when the server gave a definitive 4xx — re-attempting @@ -226,7 +261,14 @@ export function registerAgentTools( if (isRetry || !isRetryableUpgradeError(connErr)) { throw new UserError(formatConnectError(connErr)); } - destroySession(mcpSessionId, token, proxy, profile, createProfile); + destroySession( + mcpSessionId, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); return runCommands(true); } @@ -239,7 +281,14 @@ export function registerAgentTools( let crossOriginBaseline: string | undefined = agentSession.lastUrl; for (const cmd of commands) { if (cmd.method === 'close') { - closeSession(mcpSessionId, token, proxy, profile, createProfile); + closeSession( + mcpSessionId, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); results.push({ method: 'close', result: { closed: true } }); closedDuringBatch = true; break; @@ -253,7 +302,14 @@ export function registerAgentTools( try { resp = await send(agentSession, cmd.method, cmd.params); } catch (sendErr: unknown) { - destroySession(mcpSessionId, token, proxy, profile, createProfile); + destroySession( + mcpSessionId, + token, + proxy, + profile, + createProfile, + attachSessionId, + ); const errMessage = sendErr instanceof Error ? sendErr.message : String(sendErr); if (!isRetry) { @@ -285,6 +341,7 @@ export function registerAgentTools( proxy, profile, createProfile, + attachSessionId, ); if (!isRetry) { return runCommands(true); diff --git a/src/tools/schemas.ts b/src/tools/schemas.ts index 2818106..1d9169c 100644 --- a/src/tools/schemas.ts +++ b/src/tools/schemas.ts @@ -162,6 +162,27 @@ const TypeCommandSchema = z.object({ }), }); +const LoadSecretCommandSchema = z.object({ + method: z.literal('loadSecret'), + params: z.object({ + ref: z + .string() + .describe( + 'The credential reference/alias to inject (e.g. an op:// reference). ' + + 'The secret value is resolved server-side and typed into the field — ' + + 'you never see it. Use this for ALL passwords and usernames from a ' + + 'secrets vault; never put a secret value in `type`.', + ), + selector: z + .string() + .optional() + .describe( + 'CSS selector of the input to fill. If omitted, the secret is injected ' + + 'into the currently focused element (click/focus the field first).', + ), + }), +}); + const SelectCommandSchema = z.object({ method: z.literal('select'), params: z.object({ @@ -474,6 +495,7 @@ const specificCommandSchemas = [ CloseTabCommandSchema, ClickCommandSchema, TypeCommandSchema, + LoadSecretCommandSchema, SelectCommandSchema, CheckboxCommandSchema, HoverCommandSchema, diff --git a/test/tools/schemas.spec.ts b/test/tools/schemas.spec.ts index e63a137..5902115 100644 --- a/test/tools/schemas.spec.ts +++ b/test/tools/schemas.spec.ts @@ -242,6 +242,48 @@ describe('profile field (shared profileField helper)', () => { }); }); +describe('loadSecret command', () => { + it('accepts a loadSecret command with ref + selector', () => { + const parsed = AgentParamsSchema.parse({ + commands: [ + { + method: 'loadSecret', + params: { + ref: 'op://Automation/imdb/password', + selector: 'input#ap_password', + }, + }, + ], + }); + const cmd = parsed.commands?.[0]; + expect(cmd?.method).to.equal('loadSecret'); + expect((cmd?.params as { ref?: string })?.ref).to.equal( + 'op://Automation/imdb/password', + ); + }); + + it('accepts a loadSecret command with ref only (selector optional)', () => { + const result = AgentParamsSchema.safeParse({ + commands: [ + { + method: 'loadSecret', + params: { ref: 'op://Automation/imdb/username' }, + }, + ], + }); + expect(result.success).to.equal(true); + }); + + it('rejects a loadSecret command missing ref', () => { + const result = AgentParamsSchema.safeParse({ + commands: [ + { method: 'loadSecret', params: { selector: 'input#ap_email' } }, + ], + }); + expect(result.success).to.equal(false); + }); +}); + describe('createProfile field', () => { it('accepts a createProfile object on its own', () => { const parsed = AgentParamsSchema.parse({