Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/stg-1689-a11y-snapshot.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@browserbasehq/stagehand": patch
"@browserbasehq/browse-cli": patch
---

Add interactive accessibility snapshots for agent and CLI usage.
43 changes: 38 additions & 5 deletions packages/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* Multiple sessions can run simultaneously using --session <name> or BROWSE_SESSION env var.
*/

import { Command, Option } from "commander";
import { Command, InvalidArgumentError, Option } from "commander";
import { Stagehand, type Page as BrowsePage } from "@browserbasehq/stagehand";
import { promises as fs } from "fs";
import * as path from "path";
Expand Down Expand Up @@ -1300,15 +1300,26 @@ async function executeCommand(

// Snapshot
case "snapshot": {
const [compact] = args as [boolean?];
const snapshot = await page!.snapshot();
const [options] = args as [
{
compact?: boolean;
interactive?: boolean;
maxDepth?: number;
focusSelector?: string;
}?,
];
const snapshot = await page!.snapshot({
interactive: options?.interactive,
maxDepth: options?.maxDepth,
focusSelector: options?.focusSelector,
});

refMap = {
xpathMap: snapshot.xpathMap ?? {},
urlMap: snapshot.urlMap ?? {},
};

if (compact) {
if (options?.compact) {
return { tree: snapshot.formattedTree };
}
return {
Expand Down Expand Up @@ -2682,10 +2693,32 @@ program
.command("snapshot")
.description("Get accessibility tree snapshot")
.option("-c, --compact", "Output tree only (no xpath map)")
.option(
"-i, --interactive",
"Only include actionable elements and their structural ancestors",
)
.option("-d, --depth <n>", "Maximum tree depth", (value) => {
  // Number.parseInt stops at the first non-digit character, so inputs such as
  // "3abc", "1.5" or "1e3" were previously accepted as 3, 1 and 1. Validate
  // the whole argument first so the error message below is actually enforced.
  const text = value.trim();
  const parsed = /^\+?\d+$/.test(text)
    ? Number.parseInt(text, 10)
    : Number.NaN;
  // isSafeInteger rejects NaN as well as digit strings too large to be
  // represented exactly as a number.
  if (!Number.isSafeInteger(parsed)) {
    throw new InvalidArgumentError("depth must be a non-negative integer");
  }
  return parsed;
})
.option(
"-s, --selector <selector>",
"Scope snapshot to CSS selector or XPath",
)
.action(async (cmdOpts) => {
const opts = program.opts<GlobalOpts>();
try {
const result = (await runCommand("snapshot", [cmdOpts.compact])) as {
const result = (await runCommand("snapshot", [
{
compact: cmdOpts.compact,
interactive: cmdOpts.interactive,
maxDepth: cmdOpts.depth,
focusSelector: cmdOpts.selector,
},
])) as {
tree: string;
xpathMap?: Record<string, string>;
urlMap?: Record<string, string>;
Expand Down
49 changes: 38 additions & 11 deletions packages/core/lib/v3/agent/tools/ariaTree.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,52 @@ import { TimeoutError } from "../../types/public/sdkErrors.js";
export const ariaTreeTool = (v3: V3, toolTimeout?: number) =>
tool({
description:
"gets the accessibility (ARIA) hybrid tree text for the current page. use this to understand structure and content.",
inputSchema: z.object({}),
execute: async () => {
"gets the accessibility (ARIA) hybrid tree text for the current page. defaults to interactive controls for efficient action planning; use mode 'full' for reading page content.",
inputSchema: z.object({
mode: z
.enum(["interactive", "full"])
.optional()
.describe(
"interactive returns actionable elements only; full returns the complete accessibility tree",
),
maxDepth: z
.number()
.int()
.nonnegative()
.optional()
.describe("optional maximum tree depth"),
}),
execute: async ({ mode = "interactive", maxDepth }) => {
try {
v3.logger({
category: "agent",
message: `Agent calling tool: ariaTree`,
message: `Agent calling tool: ariaTree (${mode})`,
level: 1,
});
const page = await v3.context.awaitActivePage();
const extractOptions = toolTimeout
? { timeout: toolTimeout }
: undefined;
const { pageText } = (await v3.extract(extractOptions)) as {
pageText: string;
};
const snapshotPromise = page.snapshot({
interactive: mode === "interactive",
maxDepth,
});
let timeoutId: ReturnType<typeof setTimeout> | undefined;
const snapshot = toolTimeout
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
? await Promise.race([
snapshotPromise,
new Promise<never>((_, reject) => {
timeoutId = setTimeout(
() => reject(new TimeoutError("ariaTree", toolTimeout)),
toolTimeout,
);
}),
]).finally(() => {
if (timeoutId) {
clearTimeout(timeoutId);
}
})
: await snapshotPromise;
const pageUrl = page.url();

let content = pageText;
let content = snapshot.formattedTree;
const MAX_TOKENS = 70000; // rough cap, assume ~4 chars per token for conservative truncation
const estimatedTokens = Math.ceil(content.length / 4);
if (estimatedTokens > MAX_TOKENS) {
Expand Down
10 changes: 10 additions & 0 deletions packages/core/lib/v3/types/private/snapshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ export type SnapshotOptions = {
* Optional feature flag that surfaces experimental traversal tweaks in the Accessibility layer.
*/
experimental?: boolean;
/**
* Filter the snapshot to actionable elements and their structural ancestors.
*/
interactive?: boolean;
/**
* Maximum tree depth to include after pruning.
*/
maxDepth?: number;
};

/**
Expand Down Expand Up @@ -105,6 +113,8 @@ export type A11yNode = {
export type A11yOptions = {
focusSelector?: string;
experimental: boolean;
interactive?: boolean;
maxDepth?: number;
tagNameMap: Record<string, string>;
scrollableMap: Record<string, boolean>;
encode: (backendNodeId: number) => string;
Expand Down
6 changes: 6 additions & 0 deletions packages/core/lib/v3/types/public/page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,10 @@ export type SnapshotResult = {

/**
 * Options accepted by the public page snapshot call. Every field is optional.
 */
export type PageSnapshotOptions = {
/** NOTE(review): presumably controls whether iframe content is part of the snapshot — confirm against the capture code. */
includeIframes?: boolean;
/** Only include actionable elements and their structural ancestors. */
interactive?: boolean;
/**
 * Maximum tree depth to include after pruning.
 * Callers are expected to pass a non-negative integer.
 */
maxDepth?: number;
/** Scope the snapshot to a CSS selector or XPath. */
focusSelector?: string;
};
125 changes: 124 additions & 1 deletion packages/core/lib/v3/understudy/a11y/snapshot/a11yTree.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,53 @@ import {
} from "./focusSelectors.js";
import { formatTreeLine, normaliseSpaces } from "./treeFormatUtils.js";

/**
 * Lower-cased accessibility roles that mark a node as directly actionable.
 * Lookups are expected to pass an already lower-cased role string.
 */
const INTERACTIVE_ROLES: Set<string> = new Set<string>([
  "button",
  "checkbox",
  "combobox",
  "disclosuretriangle",
  "gridcell",
  "link",
  "listbox",
  "menuitem",
  "menuitemcheckbox",
  "menuitemradio",
  "option",
  "radio",
  "rowheader",
  "scrollbar",
  "searchbox",
  "select",
  "slider",
  "spinbutton",
  "switch",
  "tab",
  "textbox",
  "treeitem",
]);

/**
 * Lower-cased HTML tag names treated as actionable when a node's role alone
 * does not identify it as interactive.
 */
const INTERACTIVE_TAGS: Set<string> = new Set<string>([
  "a",
  "button",
  "input",
  "select",
  "summary",
  "textarea",
]);

/**
 * Lower-cased roles of container nodes that are kept as structural context
 * around actionable descendants even though they are not actionable
 * themselves.
 */
const INTERACTIVE_CONTEXT_ROLES: Set<string> = new Set<string>([
  "alertdialog",
  "banner",
  "dialog",
  "form",
  "main",
  "menu",
  "navigation",
  "region",
  "rootwebarea",
  "toolbar",
]);

/**
* Fetch and prune the accessibility tree for a frame, optionally scoping the
* output to a selector root for faster targeted snapshots.
Expand Down Expand Up @@ -180,7 +227,13 @@ export async function buildHierarchicalTree(
Boolean,
) as A11yNode[];

return { tree: cleaned };
const depthLimited = limitTreeDepth(cleaned, opts.maxDepth);

if (opts.interactive) {
return { tree: filterToInteractiveNodes(depthLimited, opts) };
}

return { tree: depthLimited };

async function pruneStructuralSafe(node: A11yNode): Promise<A11yNode | null> {
if (+node.nodeId < 0) return null;
Expand Down Expand Up @@ -221,6 +274,76 @@ export function isStructural(role: string): boolean {
return r === "generic" || r === "none" || r === "inlinetextbox";
}

/**
 * Report whether an accessibility node counts as an actionable element.
 *
 * A node qualifies when its lower-cased role is listed in INTERACTIVE_ROLES,
 * when that role starts with "scrollable", or when the first comma-separated
 * entry recorded for its encoded id in `opts.tagNameMap` is listed in
 * INTERACTIVE_TAGS.
 *
 * @param node - Node to classify.
 * @param opts - Snapshot options; only `tagNameMap` is read here.
 * @returns `true` when the node is considered interactive.
 */
export function isInteractive(node: A11yNode, opts: A11yOptions): boolean {
  const normalizedRole = (node.role ?? "").toLowerCase();

  const roleIsActionable =
    INTERACTIVE_ROLES.has(normalizedRole) ||
    normalizedRole.startsWith("scrollable");
  if (roleIsActionable) {
    return true;
  }

  // Without an encoded id there is no key to look the tag name up with.
  const ref = node.encodedId;
  if (!ref) {
    return false;
  }

  const recordedTags = opts.tagNameMap[ref];
  const [firstTag] = recordedTags?.split(",") ?? [];
  const primaryTag = firstTag?.trim();
  return primaryTag !== undefined && INTERACTIVE_TAGS.has(primaryTag);
}

/**
 * Reduce a forest of accessibility nodes to the interactive nodes plus the
 * context containers that hold them.
 *
 * Each node is handled after its children:
 * - an interactive node is kept, with its encoded id and surviving children;
 * - a non-interactive node with no surviving children is dropped;
 * - a non-interactive, non-context node is replaced by its surviving children;
 * - a non-interactive context node is kept, but its encoded id is cleared.
 *
 * @param roots - Root nodes of the tree(s) to filter; they are not mutated.
 * @param opts - Snapshot options forwarded to `isInteractive`.
 * @returns New root nodes after filtering.
 */
function filterToInteractiveNodes(
  roots: A11yNode[],
  opts: A11yOptions,
): A11yNode[] {
  const prune = (node: A11yNode): A11yNode[] => {
    // Children are resolved first so the decision below can see what survived.
    const keptChildren: A11yNode[] = [];
    for (const child of node.children ?? []) {
      for (const survivor of prune(child)) {
        if (survivor) {
          keptChildren.push(survivor);
        }
      }
    }
    const childrenOrNone = keptChildren.length ? keptChildren : undefined;

    if (isInteractive(node, opts)) {
      return [{ ...node, encodedId: node.encodedId, children: childrenOrNone }];
    }

    if (keptChildren.length === 0) {
      return [];
    }

    if (!isInteractiveContext(node)) {
      // Transparent wrapper: hoist its surviving children one level up.
      return keptChildren;
    }

    // Context containers stay for structure but carry no encoded id.
    return [{ ...node, encodedId: undefined, children: childrenOrNone }];
  };

  const result: A11yNode[] = [];
  for (const root of roots) {
    for (const kept of prune(root)) {
      result.push(kept);
    }
  }
  return result;
}

/**
 * Report whether a node is a container worth keeping as context around
 * interactive descendants: a role starting with "scrollable", a role listed
 * in INTERACTIVE_CONTEXT_ROLES, or the roles "body" / "html".
 *
 * @param node - Node whose role is inspected (compared case-insensitively).
 * @returns `true` when the node should be preserved as a context container.
 */
function isInteractiveContext(node: A11yNode): boolean {
  const normalizedRole = (node.role ?? "").toLowerCase();
  return (
    normalizedRole.startsWith("scrollable") ||
    INTERACTIVE_CONTEXT_ROLES.has(normalizedRole) ||
    normalizedRole === "body" ||
    normalizedRole === "html"
  );
}

/**
 * Return a copy of the forest truncated below a given depth.
 *
 * Roots sit at depth 0 and are always kept. The limit is floored and clamped
 * to be at least 0. When `maxDepth` is `undefined` the input array is
 * returned unchanged (same reference, no copy).
 *
 * @param roots - Root nodes; they are not mutated.
 * @param maxDepth - Deepest level to keep, or `undefined` for no limit.
 * @returns Shallow-copied nodes whose `children` is `undefined` when no child
 *   survives the cut.
 */
function limitTreeDepth(
  roots: A11yNode[],
  maxDepth: number | undefined,
): A11yNode[] {
  if (maxDepth === undefined) {
    return roots;
  }

  const deepestLevel = Math.max(0, Math.floor(maxDepth));

  function cloneWithin(node: A11yNode, level: number): A11yNode {
    const kept: A11yNode[] = [];
    const childLevel = level + 1;
    // Children one level past the limit are discarded without being visited.
    if (!(childLevel > deepestLevel)) {
      for (const child of node.children ?? []) {
        kept.push(cloneWithin(child, childLevel));
      }
    }
    return {
      ...node,
      children: kept.length > 0 ? kept : undefined,
    };
  }

  const limited: A11yNode[] = [];
  for (const root of roots) {
    limited.push(cloneWithin(root, 0));
  }
  return limited;
}

export function extractUrlFromAXNode(
ax: Protocol.Accessibility.AXNode,
): string | undefined {
Expand Down
4 changes: 4 additions & 0 deletions packages/core/lib/v3/understudy/a11y/snapshot/capture.ts
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ export async function tryScopedSnapshot(
scrollableMap,
encode: (backendNodeId) =>
`${page.getOrdinal(targetFrameId)}-${backendNodeId}`,
interactive: options?.interactive,
maxDepth: options?.maxDepth,
},
);

Expand Down Expand Up @@ -331,6 +333,8 @@ export async function collectPerFrameMaps(
tagNameMap,
scrollableMap,
encode: (backendNodeId) => `${page.getOrdinal(frameId)}-${backendNodeId}`,
interactive: options?.interactive,
maxDepth: options?.maxDepth,
});

perFrameOutlines.push({ frameId, outline });
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ import type { A11yNode } from "../../../types/private/snapshot.js";
*/
export function formatTreeLine(node: A11yNode, level = 0): string {
const indent = " ".repeat(level);
const labelId = node.encodedId ?? node.nodeId;
const label = `[${labelId}] ${node.role}${node.name ? `: ${cleanText(node.name)}` : ""}`;
const ref = node.encodedId ? `[${node.encodedId}] ` : "";
const label = `${ref}${node.role}${node.name ? `: ${cleanText(node.name)}` : ""}`;
const kids =
node.children?.map((c) => formatTreeLine(c, level + 1)).join("\n") ?? "";
return kids ? `${indent}${label}\n${kids}` : `${indent}${label}`;
Expand Down
20 changes: 18 additions & 2 deletions packages/core/lib/v3/understudy/page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1899,12 +1899,28 @@ export class Page {
await captureHybridSnapshot(this, {
pierceShadow: true,
includeIframes: options?.includeIframes,
interactive: options?.interactive,
maxDepth: options?.maxDepth,
focusSelector: options?.focusSelector,
});

const refsInTree = new Set(
Array.from(
combinedTree.matchAll(/(?:^|\n)\s*\[([^\]]+)\]/g),
(match) => match[1],
),
);
const filterMap = (map: Record<string, string>) =>
Object.fromEntries(
Object.entries(map).filter(([encodedId]) =>
refsInTree.has(encodedId),
),
);

return {
formattedTree: combinedTree,
xpathMap: combinedXpathMap,
urlMap: combinedUrlMap,
xpathMap: filterMap(combinedXpathMap),
urlMap: filterMap(combinedUrlMap),
};
} catch (err) {
throw new StagehandSnapshotError(err);
Expand Down
Loading
Loading