Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/@types/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ import type { ProxyOptionsSchema } from '../lib/agent-client.js';
/**
* Per-request auth result: the Browserless credentials and target API that
* authentication resolved for the caller. Extends `Record<string, unknown>`,
* so extra keys are allowed alongside the declared ones.
*/
export interface BrowserlessSession extends Record<string, unknown> {
/** Browserless API token resolved for this request. */
token: string;
/** URL of the Browserless API that the token is used against. */
apiUrl: string;
/**
* A pre-created browser session id to ATTACH to (via /chromium/agent?sessionId),
* threaded by the caller through the `x-browserless-session-id` header (the
* `browserlessSessionId` query parameter is accepted as a fallback). Used by
* the autologin runner, which does POST /profile itself and hands the agent the
* resulting id instead of letting the model open a `createProfile` session.
* Left undefined when the caller supplied no id.
*/
attachSessionId?: string;
}

export interface SupabaseJwtPayload {
Expand Down Expand Up @@ -134,6 +141,7 @@ export interface SnapshotElement {
focused?: boolean;
required?: boolean;
ariaLabel?: string;
frameId?: string;
}

export interface TabInfo {
Expand All @@ -143,6 +151,13 @@ export interface TabInfo {
active: boolean;
}

/**
* One iframe reported in a snapshot's `frames` list. The snapshot formatter
* assigns each entry a display label (`frame#1`, `frame#2`, … in list order)
* and prints it with its URL and origin kind.
*/
export interface FrameInfo {
/** Identifier matched against `SnapshotElement.frameId` to tag elements that live in this frame. */
frameId: string;
/** Frame URL, shown next to the frame's label in the formatted snapshot. */
url: string;
/** True when the frame is cross-origin; the formatter prints `cross-origin` or `same-origin` accordingly. */
crossOrigin: boolean;
}

export interface SnapshotResult {
url: string;
title: string;
Expand All @@ -151,6 +166,7 @@ export interface SnapshotResult {
tabs?: TabInfo[];
activeTargetId?: string | null;
detectedChallenges?: string[];
frames?: FrameInfo[];
}

export interface ActiveSession {
Expand Down
19 changes: 16 additions & 3 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,18 +117,31 @@ const hybridAuthenticate =
params.get('browserlessUrl') ??
config.browserlessApiUrl;

// A pre-created session id to attach to, threaded by the autologin
// runner. The agent tool opens /chromium/agent?sessionId=<this> instead
// of doing its own POST /profile.
const attachSessionId =
(request.headers['x-browserless-session-id'] as string) ??
params.get('browserlessSessionId') ??
undefined;

// JWTs have 3 dot-separated base64url segments; plain API keys do not.
const isJwt = headerToken ? headerToken.split('.').length === 3 : false;

// apiUrl/attachSessionId are the same across every auth path; only the
// resolved token differs.
const session = (token: string): BrowserlessSession =>
({ token, apiUrl, attachSessionId }) as BrowserlessSession;

// 1. Authorization header with plain API key
if (headerToken && !isJwt) {
return { token: headerToken, apiUrl } as BrowserlessSession;
return session(headerToken);
}

// 2. ?token= query param
const directToken = params.get('token') || undefined;
if (directToken) {
return { token: directToken, apiUrl } as BrowserlessSession;
return session(directToken);
}

// 3. Authorization header with JWT → decode Supabase token directly
Expand All @@ -138,7 +151,7 @@ const hybridAuthenticate =
config.supabaseServiceRoleKey,
headerToken,
);
return { token: apiKey, apiUrl } as BrowserlessSession;
return session(apiKey);
}

throw new Error(
Expand Down
52 changes: 43 additions & 9 deletions src/lib/agent-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -237,11 +237,13 @@ const getSessionKey = (
proxy?: ProxyOptions,
profile?: string,
createProfile?: CreateProfileParams,
attachSessionId?: string,
): string =>
(mcpSessionId ?? `stdio:${hashToken(token)}`) +
proxyFingerprint(proxy) +
(profile ? KEY_SEP + 'profile#' + hashToken(profile) : '') +
(createProfile ? KEY_SEP + 'create#' + hashToken(createProfile.name) : '');
(createProfile ? KEY_SEP + 'create#' + hashToken(createProfile.name) : '') +
(attachSessionId ? KEY_SEP + 'attach#' + attachSessionId : '');

/**
* Build the WebSocket URL for `/chromium/agent`: normalize trailing slashes,
Expand Down Expand Up @@ -601,9 +603,17 @@ export const getOrCreateSession = async (
proxy?: ProxyOptions,
profile?: string,
createProfile?: CreateProfileParams,
attachSessionId?: string,
): Promise<ActiveSession> => {
sweepSessions();
const key = getSessionKey(mcpSessionId, token, proxy, profile, createProfile);
const key = getSessionKey(
mcpSessionId,
token,
proxy,
profile,
createProfile,
attachSessionId,
);
const existing = sessions.get(key);

if (existing && existing.ws.readyState === WebSocket.OPEN) {
Expand All @@ -626,11 +636,19 @@ export const getOrCreateSession = async (
}

const creation = (async (): Promise<ActiveSession> => {
// Profile-creation mode: launch a tracked session via POST /profile, then
// attach the agent WS to it by id. Otherwise launch a fresh agent browser.
const creationSessionId = createProfile
? (await postCreateProfile(apiUrl, token, createProfile)).id
: undefined;
// Three modes for the session to attach to:
// - attachSessionId: a session the caller already created (autologin
// runner did POST /profile itself) — attach by id, no POST here.
// - createProfile: open a tracked session via POST /profile, then attach.
// - neither: launch a fresh agent browser.
let creationSessionId: string | undefined;
if (attachSessionId) {
creationSessionId = attachSessionId;
} else if (createProfile) {
creationSessionId = (
await postCreateProfile(apiUrl, token, createProfile)
).id;
}
const ws = await connect(apiUrl, token, proxy, profile, creationSessionId);
const session: ActiveSession = {
ws,
Expand Down Expand Up @@ -727,8 +745,16 @@ export const closeSession = (
proxy?: ProxyOptions,
profile?: string,
createProfile?: CreateProfileParams,
attachSessionId?: string,
): void => {
const key = getSessionKey(mcpSessionId, token, proxy, profile, createProfile);
const key = getSessionKey(
mcpSessionId,
token,
proxy,
profile,
createProfile,
attachSessionId,
);
const session = sessions.get(key);
if (session) {
try {
Expand All @@ -751,8 +777,16 @@ export const destroySession = (
proxy?: ProxyOptions,
profile?: string,
createProfile?: CreateProfileParams,
attachSessionId?: string,
): void => {
const key = getSessionKey(mcpSessionId, token, proxy, profile, createProfile);
const key = getSessionKey(
mcpSessionId,
token,
proxy,
profile,
createProfile,
attachSessionId,
);
const session = sessions.get(key);
if (session) {
try {
Expand Down
35 changes: 32 additions & 3 deletions src/lib/agent-format.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export type {
SnapshotResult,
SnapshotElement,
TabInfo,
FrameInfo,
} from '../@types/types.js';

const safeOrigin = (url: string): string | undefined => {
Expand Down Expand Up @@ -118,10 +119,16 @@ export const formatConnectError = (err: unknown): string => {

/**
* Format a single snapshot element as a compact one-liner:
* [ref] tag role "name" ref=selector value="…" (state)
* [ref] tag role "name" ref=selector value="…" (state) [frame#N]
* e.g. [7] input checkbox "Remember me" ref=input#remember (checked, required)
* `frameLabels` maps a frameId to its display label (frame#1, …); when an
* element carries a frameId, the label is appended so the agent sees which
* iframe it lives in.
*/
const formatElement = (el: SnapshotElement): string => {
const formatElement = (
el: SnapshotElement,
frameLabels?: Map<string, string>,
): string => {
const parts: string[] = [`[${el.ref}]`, el.tag, el.role];
const name = el.name || el.text || '';
if (name) parts.push(`"${name}"`);
Expand All @@ -141,6 +148,9 @@ const formatElement = (el: SnapshotElement): string => {
if (el.required) flags.push('required');
if (flags.length) parts.push(`(${flags.join(', ')})`);

const frameLabel = el.frameId && frameLabels?.get(el.frameId);
if (frameLabel) parts.push(`[${frameLabel}]`);

return parts.join(' ');
};

Expand All @@ -166,10 +176,29 @@ export const formatSnapshot = (snapshot: SnapshotResult): string => {
}
}

// Label every reported iframe (frame#1, …), same-origin or cross-origin, and
// list them so the agent knows which elements live in a frame and that their
// deep-ref selectors pierce it.
const frameLabels = new Map<string, string>();
if (snapshot.frames?.length) {
snapshot.frames.forEach((frame, i) =>
frameLabels.set(frame.frameId, `frame#${i + 1}`),
);
lines.push(`Frames (${snapshot.frames.length} iframes):`);
for (const frame of snapshot.frames) {
const origin = frame.crossOrigin ? 'cross-origin' : 'same-origin';
lines.push(
` ${frameLabels.get(frame.frameId)} ${frame.url} (${origin})`,
);
}
lines.push(
'Elements tagged [frame#N] live in that iframe; their deep-ref selectors pierce it — pass as-is to click/type/hover.',
);
Comment on lines +194 to +195

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟡 Minor | ⚡ Quick win

Include checkbox in the frame-action instruction list.

The snapshot guidance omits checkbox, but the skill/system guidance includes it. That inconsistency can cause missed valid interactions in framed widgets.

Suggested fix
-      'Elements tagged [frame#N] live in that iframe; their deep-ref selectors pierce it — pass as-is to click/type/hover.',
+      'Elements tagged [frame#N] live in that iframe; their deep-ref selectors pierce it — pass as-is to click/type/hover/checkbox.',
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
'Elements tagged [frame#N] live in that iframe; their deep-ref selectors pierce it — pass as-is to click/type/hover.',
);
'Elements tagged [frame#N] live in that iframe; their deep-ref selectors pierce it — pass as-is to click/type/hover/checkbox.',
);
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/lib/agent-format.ts` around lines 194 - 195, The frame-action instruction
message in the agent-format.ts file (around line 194-195) is missing checkbox
from its list of supported actions, while the skill/system guidance elsewhere
includes it. Add checkbox to the list of actions mentioned in the instruction
string that describes elements tagged with [frame#N] to ensure consistency
between the snapshot guidance and the skill/system guidance. This will prevent
missed valid interactions in framed widgets.

}

lines.push('');

for (const el of snapshot.elements) {
lines.push(formatElement(el));
lines.push(formatElement(el, frameLabels));
}

lines.push('--- END SNAPSHOT ---');
Expand Down
6 changes: 6 additions & 0 deletions src/lib/define-tool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ export interface ToolRunContext<P> {
}) => Promise<void>;
/** MCP session id (httpStream transport) or undefined for stdio — used by agent tool. */
sessionId: string | undefined;
/**
* Pre-created browser session id to attach to (from the `x-browserless-session-id`
* header). When set, the agent tool attaches to it instead of opening its own.
*/
attachSessionId?: string;
}

export interface ToolDefinition<P, R> {
Expand Down Expand Up @@ -142,6 +147,7 @@ export function defineTool<P, R>(
apiUrl,
reportProgress,
sessionId,
attachSessionId: s?.attachSessionId,
});
} catch (err) {
if (err instanceof ProfileNotFoundError) {
Expand Down
2 changes: 2 additions & 0 deletions src/skills/autonomous-login.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ One batched call (type username, type password, click submit) with Gate-2 values
2. Password input absent from new snapshot.
3. Authed element matching `/log out|sign out|my account|profile|dashboard|avatar/i`.

The visible account/display name will usually NOT equal the email or username you typed (it's the profile's display name, often a real name) — that's expected, NOT a mismatch. Never mark a login failed because the shown identity differs from the credential; judge only by the three signals above.

None holds:

- Error matching `/invalid|incorrect|wrong|doesn'?t match|not recognized|please try again/i` → `INVALID_CREDENTIALS`.
Expand Down
11 changes: 10 additions & 1 deletion src/skills/shadow-dom.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

Snapshot contains `deep-ref=` selectors, or you hit `SELECTOR_NOT_FOUND` on regular selector. Page using shadow DOM or iframes — read before next action.

## Iframes in the snapshot

Iframes (same-origin and cross-origin) are now snapshotted too. When present:

- Snapshot shows a `Frames (N iframes):` block listing each frame's label, URL, and origin.
- Elements inside a frame are tagged `[frame#N]` and carry a ready `deep-ref=` selector — cross-origin uses `< *url* css`, same-origin uses `< css`. Pass it as-is to `click`/`type`/`hover`/`checkbox` — no frame switching, no hand-construction.

Only build a deep selector by hand (below) when a frame element wasn't surfaced (a11y-empty widget, capped snapshot).

## Deep selectors: `< ` prefix

Browserless deep selectors start with `< ` (less-than, space). Space mandatory. Format:
Expand All @@ -20,7 +29,7 @@ When snapshot lists `deep-ref=< button#deny`, pass to `click` / `type` / `hover`

## Constructing deep selectors for iframes snapshot didn't surface

Snapshots only include accessible content. Iframes (captcha/payment widgets) often have nothing meaningful in accessibility tree. Build selector by hand:
Fallback only — most cross-origin iframes are now in the snapshot (see above). Some widgets still have nothing meaningful in the accessibility tree. Build selector by hand:

- `< *google.com/recaptcha* #recaptcha-anchor` — reCAPTCHA checkbox
- `< *hcaptcha.com* #checkbox` — hCaptcha checkbox
Expand Down
5 changes: 4 additions & 1 deletion src/skills/system-prompt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,10 @@ Load manually via **browserless_skill** if suspected but not injected:
## Selectors
- Use **ref=** (CSS) or **deep-ref=** (starts \`< \`) exactly as shown in snapshot
- Example: \`[3] button "Sign In" ref=button#submit\` → \`"button#submit"\`
- deep-ref for shadow DOM — see \`shadow-dom\` skill
- deep-ref for shadow DOM / iframes — see \`shadow-dom\` skill

## Iframes
Snapshots include a \`Frames\` list (cross-origin iframes) when present. Elements inside a frame are tagged \`[frame#N]\` and carry a \`deep-ref=< *url* css\` selector that already pierces the frame — pass it as-is to \`click\`/\`type\`/\`hover\`/\`checkbox\`. No frame switching needed. captcha/payment widgets (reCAPTCHA, hCaptcha, Stripe, Turnstile) show up here. \`shadow-dom\` skill auto-loads when frames present.
Comment thread
andyMrtnzP marked this conversation as resolved.

## Tabs
Snapshots include \`tabs\` + \`activeTargetId\` — no getTabs needed. Multi-tab / \`snapshot { targetId }\` in \`tabs\` skill (auto-loads when >1 tab).
Expand Down
Loading