diff --git a/.changeset/stg-1689-a11y-snapshot.md b/.changeset/stg-1689-a11y-snapshot.md new file mode 100644 index 000000000..fdd799f9c --- /dev/null +++ b/.changeset/stg-1689-a11y-snapshot.md @@ -0,0 +1,6 @@ +--- +"@browserbasehq/stagehand": patch +"@browserbasehq/browse-cli": patch +--- + +Add interactive accessibility snapshots for agent and CLI usage. diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index e30e5437a..41118c701 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -8,7 +8,7 @@ * Multiple sessions can run simultaneously using --session or BROWSE_SESSION env var. */ -import { Command, Option } from "commander"; +import { Command, InvalidArgumentError, Option } from "commander"; import { Stagehand, type Page as BrowsePage } from "@browserbasehq/stagehand"; import { promises as fs } from "fs"; import * as path from "path"; @@ -1300,15 +1300,26 @@ async function executeCommand( // Snapshot case "snapshot": { - const [compact] = args as [boolean?]; - const snapshot = await page!.snapshot(); + const [options] = args as [ + { + compact?: boolean; + interactive?: boolean; + maxDepth?: number; + focusSelector?: string; + }?, + ]; + const snapshot = await page!.snapshot({ + interactive: options?.interactive, + maxDepth: options?.maxDepth, + focusSelector: options?.focusSelector, + }); refMap = { xpathMap: snapshot.xpathMap ?? {}, urlMap: snapshot.urlMap ?? {}, }; - if (compact) { + if (options?.compact) { return { tree: snapshot.formattedTree }; } return { @@ -2682,10 +2693,32 @@ program .command("snapshot") .description("Get accessibility tree snapshot") .option("-c, --compact", "Output tree only (no xpath map)") + .option( + "-i, --interactive", + "Only include actionable elements and their structural ancestors", + ) + .option("-d, --depth ", "Maximum tree depth", (value) => { + const parsed = Number.parseInt(value, 10); + if (Number.isNaN(parsed) || parsed < 0) { + throw new InvalidArgumentError("depth must be a non-negative integer"); + } + return parsed; + }) + .option( + "-s, --selector ", + "Scope snapshot to CSS selector or XPath", + ) .action(async (cmdOpts) => { const opts = program.opts(); try { - const result = (await runCommand("snapshot", [cmdOpts.compact])) as { + const result = (await runCommand("snapshot", [ + { + compact: cmdOpts.compact, + interactive: cmdOpts.interactive, + maxDepth: cmdOpts.depth, + focusSelector: cmdOpts.selector, + }, + ])) as { tree: string; xpathMap?: Record; urlMap?: Record; diff --git a/packages/core/lib/v3/agent/tools/ariaTree.ts b/packages/core/lib/v3/agent/tools/ariaTree.ts index 2537de8cf..02d846785 100644 --- a/packages/core/lib/v3/agent/tools/ariaTree.ts +++ b/packages/core/lib/v3/agent/tools/ariaTree.ts @@ -6,25 +6,52 @@ import { TimeoutError } from "../../types/public/sdkErrors.js"; export const ariaTreeTool = (v3: V3, toolTimeout?: number) => tool({ description: - "gets the accessibility (ARIA) hybrid tree text for the current page. use this to understand structure and content.", - inputSchema: z.object({}), - execute: async () => { + "gets the accessibility (ARIA) hybrid tree text for the current page. defaults to interactive controls for efficient action planning; use mode 'full' for reading page content.", + inputSchema: z.object({ + mode: z + .enum(["interactive", "full"]) + .optional() + .describe( + "interactive returns actionable elements only; full returns the complete accessibility tree", + ), + maxDepth: z + .number() + .int() + .nonnegative() + .optional() + .describe("optional maximum tree depth"), + }), + execute: async ({ mode = "interactive", maxDepth }) => { try { v3.logger({ category: "agent", - message: `Agent calling tool: ariaTree`, + message: `Agent calling tool: ariaTree (${mode})`, level: 1, }); const page = await v3.context.awaitActivePage(); - const extractOptions = toolTimeout - ? { timeout: toolTimeout } - : undefined; - const { pageText } = (await v3.extract(extractOptions)) as { - pageText: string; - }; + const snapshotPromise = page.snapshot({ + interactive: mode === "interactive", + maxDepth, + }); + let timeoutId: ReturnType | undefined; + const snapshot = toolTimeout + ? await Promise.race([ + snapshotPromise, + new Promise((_, reject) => { + timeoutId = setTimeout( + () => reject(new TimeoutError("ariaTree", toolTimeout)), + toolTimeout, + ); + }), + ]).finally(() => { + if (timeoutId) { + clearTimeout(timeoutId); + } + }) + : await snapshotPromise; const pageUrl = page.url(); - let content = pageText; + let content = snapshot.formattedTree; const MAX_TOKENS = 70000; // rough cap, assume ~4 chars per token for conservative truncation const estimatedTokens = Math.ceil(content.length / 4); if (estimatedTokens > MAX_TOKENS) { diff --git a/packages/core/lib/v3/types/private/snapshot.ts b/packages/core/lib/v3/types/private/snapshot.ts index 410d28284..902a1e3d3 100644 --- a/packages/core/lib/v3/types/private/snapshot.ts +++ b/packages/core/lib/v3/types/private/snapshot.ts @@ -20,6 +20,14 @@ export type SnapshotOptions = { * Optional feature flag that surfaces experimental traversal tweaks in the Accessibility layer. */ experimental?: boolean; + /** + * Filter the snapshot to actionable elements and their structural ancestors. + */ + interactive?: boolean; + /** + * Maximum tree depth to include after pruning. + */ + maxDepth?: number; }; /** @@ -105,6 +113,8 @@ export type A11yNode = { export type A11yOptions = { focusSelector?: string; experimental: boolean; + interactive?: boolean; + maxDepth?: number; tagNameMap: Record; scrollableMap: Record; encode: (backendNodeId: number) => string; diff --git a/packages/core/lib/v3/types/public/page.ts b/packages/core/lib/v3/types/public/page.ts index 656f03a1b..bd61a625e 100644 --- a/packages/core/lib/v3/types/public/page.ts +++ b/packages/core/lib/v3/types/public/page.ts @@ -20,4 +20,10 @@ export type SnapshotResult = { export type PageSnapshotOptions = { includeIframes?: boolean; + /** Only include actionable elements and their structural ancestors. */ + interactive?: boolean; + /** Maximum tree depth to include after pruning. */ + maxDepth?: number; + /** Scope the snapshot to a CSS selector or XPath. */ + focusSelector?: string; }; diff --git a/packages/core/lib/v3/understudy/a11y/snapshot/a11yTree.ts b/packages/core/lib/v3/understudy/a11y/snapshot/a11yTree.ts index e13dd79ba..e1dcf0b99 100644 --- a/packages/core/lib/v3/understudy/a11y/snapshot/a11yTree.ts +++ b/packages/core/lib/v3/understudy/a11y/snapshot/a11yTree.ts @@ -11,6 +11,53 @@ import { } from "./focusSelectors.js"; import { formatTreeLine, normaliseSpaces } from "./treeFormatUtils.js"; +const INTERACTIVE_ROLES = new Set([ + "button", + "checkbox", + "combobox", + "disclosuretriangle", + "gridcell", + "link", + "listbox", + "menuitem", + "menuitemcheckbox", + "menuitemradio", + "option", + "radio", + "rowheader", + "scrollbar", + "searchbox", + "select", + "slider", + "spinbutton", + "switch", + "tab", + "textbox", + "treeitem", +]); + +const INTERACTIVE_TAGS = new Set([ + "a", + "button", + "input", + "select", + "summary", + "textarea", +]); + +const INTERACTIVE_CONTEXT_ROLES = new Set([ + "alertdialog", + "banner", + "dialog", + "form", + "main", + "menu", + "navigation", + "region", + "rootwebarea", + "toolbar", +]); + /** * Fetch and prune the accessibility tree for a frame, optionally scoping the * output to a selector root for faster targeted snapshots. @@ -180,7 +227,13 @@ export async function buildHierarchicalTree( Boolean, ) as A11yNode[]; - return { tree: cleaned }; + const depthLimited = limitTreeDepth(cleaned, opts.maxDepth); + + if (opts.interactive) { + return { tree: filterToInteractiveNodes(depthLimited, opts) }; + } + + return { tree: depthLimited }; async function pruneStructuralSafe(node: A11yNode): Promise { if (+node.nodeId < 0) return null; @@ -221,6 +274,76 @@ export function isStructural(role: string): boolean { return r === "generic" || r === "none" || r === "inlinetextbox"; } +export function isInteractive(node: A11yNode, opts: A11yOptions): boolean { + const role = node.role?.toLowerCase() ?? ""; + if (INTERACTIVE_ROLES.has(role)) return true; + if (role.startsWith("scrollable")) return true; + if (!node.encodedId) return false; + + const tag = opts.tagNameMap[node.encodedId]?.split(",")[0]?.trim(); + return INTERACTIVE_TAGS.has(tag); +} + +function filterToInteractiveNodes( + roots: A11yNode[], + opts: A11yOptions, +): A11yNode[] { + const filtered: A11yNode[] = []; + + for (const root of roots) { + filtered.push(...filterNode(root)); + } + + return filtered; + + function filterNode(node: A11yNode): A11yNode[] { + const children = (node.children ?? []) + .flatMap(filterNode) + .filter(Boolean) as A11yNode[]; + + const interactive = isInteractive(node, opts); + if (!interactive && children.length === 0) return []; + + if (!interactive && !isInteractiveContext(node)) return children; + + return [ + { + ...node, + encodedId: interactive ? node.encodedId : undefined, + children: children.length ? children : undefined, + }, + ]; + } +} + +function isInteractiveContext(node: A11yNode): boolean { + const role = node.role?.toLowerCase() ?? ""; + if (role.startsWith("scrollable")) return true; + if (INTERACTIVE_CONTEXT_ROLES.has(role)) return true; + return role === "body" || role === "html"; +} + +function limitTreeDepth( + roots: A11yNode[], + maxDepth: number | undefined, +): A11yNode[] { + if (maxDepth === undefined) return roots; + const normalizedDepth = Math.max(0, Math.floor(maxDepth)); + + const visit = (node: A11yNode, depth: number): A11yNode | null => { + if (depth > normalizedDepth) return null; + const children = (node.children ?? []) + .map((child) => visit(child, depth + 1)) + .filter(Boolean) as A11yNode[]; + return { + ...node, + children: children.length ? children : undefined, + }; + }; + + return roots.map((root) => visit(root, 0)).filter(Boolean) as A11yNode[]; +} + export function extractUrlFromAXNode( ax: Protocol.Accessibility.AXNode, ): string | undefined { diff --git a/packages/core/lib/v3/understudy/a11y/snapshot/capture.ts b/packages/core/lib/v3/understudy/a11y/snapshot/capture.ts index 87ff845d3..0ac2a3f92 100644 --- a/packages/core/lib/v3/understudy/a11y/snapshot/capture.ts +++ b/packages/core/lib/v3/understudy/a11y/snapshot/capture.ts @@ -191,6 +191,8 @@ export async function tryScopedSnapshot( scrollableMap, encode: (backendNodeId) => `${page.getOrdinal(targetFrameId)}-${backendNodeId}`, + interactive: options?.interactive, + maxDepth: options?.maxDepth, }, ); @@ -331,6 +333,8 @@ export async function collectPerFrameMaps( tagNameMap, scrollableMap, encode: (backendNodeId) => `${page.getOrdinal(frameId)}-${backendNodeId}`, + interactive: options?.interactive, + maxDepth: options?.maxDepth, }); perFrameOutlines.push({ frameId, outline }); diff --git a/packages/core/lib/v3/understudy/a11y/snapshot/treeFormatUtils.ts b/packages/core/lib/v3/understudy/a11y/snapshot/treeFormatUtils.ts index fd90e1529..a036a980d 100644 --- a/packages/core/lib/v3/understudy/a11y/snapshot/treeFormatUtils.ts +++ b/packages/core/lib/v3/understudy/a11y/snapshot/treeFormatUtils.ts @@ -7,8 +7,8 @@ import type { A11yNode } from "../../../types/private/snapshot.js"; */ export function formatTreeLine(node: A11yNode, level = 0): string { const indent = " ".repeat(level); - const labelId = node.encodedId ?? node.nodeId; - const label = `[${labelId}] ${node.role}${node.name ? `: ${cleanText(node.name)}` : ""}`; + const ref = node.encodedId ? `[${node.encodedId}] ` : ""; + const label = `${ref}${node.role}${node.name ? `: ${cleanText(node.name)}` : ""}`; const kids = node.children?.map((c) => formatTreeLine(c, level + 1)).join("\n") ?? ""; return kids ? `${indent}${label}\n${kids}` : `${indent}${label}`; diff --git a/packages/core/lib/v3/understudy/page.ts b/packages/core/lib/v3/understudy/page.ts index 5f241ba3f..eb74d1381 100644 --- a/packages/core/lib/v3/understudy/page.ts +++ b/packages/core/lib/v3/understudy/page.ts @@ -1899,12 +1899,28 @@ export class Page { await captureHybridSnapshot(this, { pierceShadow: true, includeIframes: options?.includeIframes, + interactive: options?.interactive, + maxDepth: options?.maxDepth, + focusSelector: options?.focusSelector, }); + const refsInTree = new Set( + Array.from( + combinedTree.matchAll(/(?:^|\n)\s*\[([^\]]+)\]/g), + (match) => match[1], + ), + ); + const filterMap = (map: Record) => + Object.fromEntries( + Object.entries(map).filter(([encodedId]) => + refsInTree.has(encodedId), + ), + ); + return { formattedTree: combinedTree, - xpathMap: combinedXpathMap, - urlMap: combinedUrlMap, + xpathMap: filterMap(combinedXpathMap), + urlMap: filterMap(combinedUrlMap), }; } catch (err) { throw new StagehandSnapshotError(err); diff --git a/packages/core/tests/unit/page-snapshot.test.ts b/packages/core/tests/unit/page-snapshot.test.ts index e2c568d2d..873f98f4f 100644 --- a/packages/core/tests/unit/page-snapshot.test.ts +++ b/packages/core/tests/unit/page-snapshot.test.ts @@ -28,6 +28,9 @@ describe("Page.snapshot", () => { expect(captureSpy).toHaveBeenCalledWith(fakePage, { pierceShadow: true, includeIframes: false, + interactive: undefined, + maxDepth: undefined, + focusSelector: undefined, }); }); @@ -43,6 +46,41 @@ describe("Page.snapshot", () => { expect(captureSpy).toHaveBeenCalledWith(fakePage, { pierceShadow: true, includeIframes: undefined, + interactive: undefined, + maxDepth: undefined, + focusSelector: undefined, }); }); + + it("forwards snapshot filtering options and drops maps for absent refs", async () => { + vi.spyOn(fs, "writeFile").mockResolvedValue(); + const captureSpy = vi + .spyOn(snapshotModule, "captureHybridSnapshot") + .mockResolvedValue({ + combinedTree: "[keep] button: Save", + combinedXpathMap: { keep: "/html/body/button", drop: "/html/body/p" }, + combinedUrlMap: { + keep: "https://example.com/save", + drop: "https://example.com", + }, + perFrame: [], + }); + + const fakePage = {} as Page; + const snapshot = await Page.prototype.snapshot.call(fakePage, { + interactive: true, + maxDepth: 3, + focusSelector: "#app", + }); + + expect(captureSpy).toHaveBeenCalledWith(fakePage, { + pierceShadow: true, + includeIframes: undefined, + interactive: true, + maxDepth: 3, + focusSelector: "#app", + }); + expect(snapshot.xpathMap).toEqual({ keep: "/html/body/button" }); + expect(snapshot.urlMap).toEqual({ keep: "https://example.com/save" }); + }); }); diff --git a/packages/core/tests/unit/snapshot-a11y-tree-utils.test.ts b/packages/core/tests/unit/snapshot-a11y-tree-utils.test.ts index 7b03212d3..38ae801cb 100644 --- a/packages/core/tests/unit/snapshot-a11y-tree-utils.test.ts +++ b/packages/core/tests/unit/snapshot-a11y-tree-utils.test.ts @@ -122,6 +122,9 @@ describe("buildHierarchicalTree", () => { tagNameMap: { root: "div", child: "span" }, }; + const collectRoles = (nodes: A11yNode[]): string[] => + nodes.flatMap((node) => [node.role, ...collectRoles(node.children ?? [])]); + it("drops structural nodes without children or names", async () => { const nodes: A11yNode[] = [ { @@ -271,6 +274,115 @@ describe("buildHierarchicalTree", () => { const { tree } = await buildHierarchicalTree(nodes, opts); expect(tree).toEqual([]); }); + + it("keeps interactive controls and useful landmarks in interactive mode", async () => { + const nodes: A11yNode[] = [ + { + role: "RootWebArea", + name: "Dashboard", + nodeId: "root", + encodedId: "root", + parentId: undefined, + childIds: ["main"], + }, + { + role: "main", + name: "", + nodeId: "main", + encodedId: "main", + parentId: "root", + childIds: ["region"], + }, + { + role: "region", + name: "Reports", + nodeId: "region", + encodedId: "region", + parentId: "main", + childIds: ["article", "paragraph", "input", "summary"], + }, + { + role: "article", + name: "Report 1", + nodeId: "article", + encodedId: "article", + parentId: "region", + childIds: ["heading", "link", "button"], + }, + { + role: "heading", + name: "Quarterly report", + nodeId: "heading", + encodedId: "heading", + parentId: "article", + childIds: [], + }, + { + role: "paragraph", + name: "Static summary copy", + nodeId: "paragraph", + encodedId: "paragraph", + parentId: "region", + childIds: [], + }, + { + role: "link", + name: "Open report", + nodeId: "link", + encodedId: "link", + parentId: "article", + childIds: [], + }, + { + role: "button", + name: "Archive report", + nodeId: "button", + encodedId: "button", + parentId: "article", + childIds: [], + }, + { + role: "textbox", + name: "Search dashboards", + nodeId: "input", + encodedId: "input", + parentId: "region", + childIds: [], + }, + { + role: "DisclosureTriangle", + name: "Advanced filters", + nodeId: "summary", + encodedId: "summary", + parentId: "region", + childIds: [], + }, + ]; + + const { tree } = await buildHierarchicalTree(nodes, { + ...opts, + interactive: true, + tagNameMap: { input: "input" }, + }); + + expect(tree).toHaveLength(1); + expect(tree[0]).toMatchObject({ role: "RootWebArea", name: "Dashboard" }); + expect(tree[0]?.encodedId).toBeUndefined(); + expect(tree[0]?.children?.[0]).toMatchObject({ role: "main" }); + expect(tree[0]?.children?.[0]?.encodedId).toBeUndefined(); + const treeText = JSON.stringify(tree); + const roles = collectRoles(tree); + + expect(treeText).toContain("Open report"); + expect(treeText).toContain("Archive report"); + expect(treeText).toContain("Search dashboards"); + expect(treeText).toContain("Advanced filters"); + expect(treeText).not.toContain("Quarterly report"); + expect(treeText).not.toContain("Static summary copy"); + expect(roles).not.toContain("article"); + expect(roles).not.toContain("paragraph"); + expect(roles).not.toContain("heading"); + }); }); describe("isStructural", () => { diff --git a/packages/core/tests/unit/snapshot-tree-format-utils.test.ts b/packages/core/tests/unit/snapshot-tree-format-utils.test.ts index 3245cabbd..350b9b2db 100644 --- a/packages/core/tests/unit/snapshot-tree-format-utils.test.ts +++ b/packages/core/tests/unit/snapshot-tree-format-utils.test.ts @@ -19,15 +19,34 @@ describe("formatTreeLine", () => { { role: "button", name: "Submit", + encodedId: "frame-2", nodeId: "ax-2", }, ], }); expect(outline).toBe( - "[frame-1] section: Container\n [ax-2] button: Submit", + "[frame-1] section: Container\n [frame-2] button: Submit", ); }); + + it("omits refs for nodes without encoded ids", () => { + const outline = formatTreeLine({ + role: "main", + name: "Content", + nodeId: "ax-1", + children: [ + { + role: "button", + name: "Submit", + encodedId: "button-1", + nodeId: "ax-2", + }, + ], + }); + + expect(outline).toBe("main: Content\n [button-1] button: Submit"); + }); }); describe("injectSubtrees", () => {