diff --git a/AGENTS.md b/AGENTS.md
index 03b2265d..105c2932 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -220,11 +220,13 @@ The embeddable companion: a **vanilla-TS + Shadow-DOM** widget (no framework —
| File | Purpose |
|------|---------|
| `sdk/web/src/index.ts` | Public `Skilly` API (`init`/`start`/`on`/`identify`/`destroy`) + auto-init from `data-skilly-*` script attrs + typed event emitter. |
-| `sdk/web/src/widget.ts` | Shadow-DOM UI: launcher button, response bubble, blue cursor element + `moveCursorTo`. |
+| `sdk/web/src/widget.ts` | Shadow-DOM UI: launcher button, response bubble, blue cursor + `setCursorPosition` (driven per-frame by the pointing engine). |
| `sdk/web/src/core.ts` | Lazy, tolerant loader for the `core/web-sdk` WASM (widget runs UI-only if absent). |
+| `sdk/web/src/digest.ts` | **8.2** DOM digest: structured, screenshot-free page view (interactive/annotated elements → stable ids, labels, rects) + element registry. `getPageDigest()`. |
+| `sdk/web/src/pointing.ts` | **8.2** Pointing engine: parse `[POINT:id:label]`, resolve (digest id / `data-skilly` / CSS / visible text), bezier-arc cursor flight, scroll/resize re-anchor. |
| `sdk/web/demo/index.html` | Demo host page (`bun run demo`). |
-> 8.1 is the embed SKELETON with a simulated turn lifecycle (listening→thinking→speaking→complete). Validated: `bun run typecheck` + `bun run build` clean; Playwright confirms the widget mounts, the launcher renders, and `start()` shows the bubble + cursor. Next: **8.2** DOM digest + selector pointing · **8.3** OpenAI Realtime voice · **8.4+** multi-tenant Next.js backend. `dist/`, `node_modules/`, `generated/` are gitignored.
+> A simulated turn lifecycle (listening→thinking→speaking→complete) keeps the embed demonstrable until 8.3. Validated: `bun run typecheck` + `bun run build` clean; Playwright confirms the widget mounts and renders, and (8.2) that the cursor flies a bezier arc and lands **exactly** on a `data-skilly`-resolved element (0px error). Next: **8.3** OpenAI Realtime voice (replaces the simulated turn) · **8.4+** multi-tenant Next.js backend. `dist/`, `node_modules/`, `generated/` are gitignored.
### Skill Files
diff --git a/sdk/web/README.md b/sdk/web/README.md
index b1734a13..59e8c83c 100644
--- a/sdk/web/README.md
+++ b/sdk/web/README.md
@@ -8,17 +8,21 @@ install it on their own web app; their visitors get the companion. See
This package consumes the shared Rust core compiled to WASM (`core/web-sdk`,
output in `sdk/web/generated/`).
-## Status — Phase 8.1 (embed skeleton)
+## Status — Phases 8.1 + 8.2
What's here:
-- `@skilly/web` package: Shadow-DOM widget (launcher, response bubble, blue
- cursor), the public `Skilly` API, and the lazy WASM-core loader.
+- **8.1** `@skilly/web` package: Shadow-DOM widget (launcher, response bubble,
+ blue cursor), the public `Skilly` API, and the lazy WASM-core loader.
+- **8.2** **DOM digest** (`getPageDigest()`) — a structured, screenshot-free view
+ of the page's interactive/annotated elements with stable ids + rects — and the
+ **selector-based pointing engine**: `[POINT:id:label]` → resolve (digest id /
+ `data-skilly` / CSS / visible text) → **bezier-arc cursor flight** → re-anchor
+ on scroll/resize.
- A simulated turn lifecycle (listening → thinking → speaking → complete) so the
embed is demonstrable end-to-end.
-Layered on next: **8.2** DOM digest + selector-based pointing · **8.3** OpenAI
-Realtime voice pipeline · **8.4+** multi-tenant Next.js backend (keys, metering,
-SKILL.md serving).
+Layered on next: **8.3** OpenAI Realtime voice pipeline (replaces the simulated
+turn) · **8.4+** multi-tenant Next.js backend (keys, metering, SKILL.md serving).
## Install / embed
diff --git a/sdk/web/demo/index.html b/sdk/web/demo/index.html
index 80021c4c..029291a1 100644
--- a/sdk/web/demo/index.html
+++ b/sdk/web/demo/index.html
@@ -26,9 +26,9 @@
Getting started
This page simulates a website owner's app. The @skilly/web widget is mounted via a single script tag.
-
+
Pricing
-
A section the companion could point at (target resolution lands in Phase 8.2).
+
A section the companion points at — resolved via its data-skilly annotation and a bezier-arc cursor flight (Phase 8.2).
diff --git a/sdk/web/src/digest.ts b/sdk/web/src/digest.ts
new file mode 100644
index 00000000..cc00350a
--- /dev/null
+++ b/sdk/web/src/digest.ts
@@ -0,0 +1,140 @@
+// DOM digest — the web analog of the desktop screenshot.
+//
+// Instead of sending pixels, we send the host page's *structure*: a compact,
+// stable list of the interactive / annotated / heading elements the companion
+// can talk about and point at. Each entry has a stable id, an accessible label,
+// a role, and its current viewport rect. The accompanying registry maps ids
+// back to live elements so the pointing engine can resolve a target.
+//
+// This is cheaper, more accurate, and more privacy-friendly than a screenshot
+// (see docs/architecture/web-sdk-prd.md §6).
+
+/**
+ * An element's bounding box as reported by `getBoundingClientRect()` at digest
+ * time, with every component rounded to the nearest integer.
+ */
+export interface DigestElementRect {
+ x: number;
+ y: number;
+ width: number;
+ height: number;
+}
+
+/** One entry of the DOM digest: a single visible element the companion can reference. */
+export interface DigestElement {
+ /**
+ * Id the AI references in a [POINT:id:label] tag. Uses the element's
+ * `data-skilly` value when present, otherwise a positional `el_<n>` id.
+ */
+ id: string;
+ /**
+ * Coarse role. An explicit `role` attribute is passed through verbatim (so
+ * values such as `tab` can appear); otherwise one of
+ * link | button | input | heading | region derived from the tag name.
+ */
+ role: string;
+ /**
+ * Accessible name, taken from the first available of: aria-label,
+ * data-skilly-label, data-skilly, placeholder, title, alt, text content.
+ * Whitespace is collapsed and the result is capped in length.
+ */
+ label: string;
+ /** Bounding box of the element at the moment the digest was built. */
+ rect: DigestElementRect;
+}
+
+/** Structured snapshot of the host page produced by `buildDomDigest`. */
+export interface DomDigest {
+ /** `window.location.href` at digest time. */
+ url: string;
+ /** `document.title` at digest time. */
+ title: string;
+ /** `window.innerWidth` / `window.innerHeight` at digest time. */
+ viewport: { width: number; height: number };
+ /** Selected elements, authored (`data-skilly`) annotations first. */
+ elements: DigestElement[];
+ /** True when more elements existed than `maxElements` — never silently dropped. */
+ truncated: boolean;
+}
+
+/**
+ * Maps a digest id to the live element, for the pointing engine to resolve.
+ * Keys are the `DigestElement.id` strings; values are the elements they were
+ * computed from when the digest was built.
+ */
+export type ElementRegistry = Map<string, HTMLElement>;
+
+// Single CSS selector list matching every element kind the digest considers.
+const INTERACTIVE_SELECTOR = [
+  // Native links and form controls.
+  "a[href]", "button", "input:not([type=hidden])", "select", "textarea",
+  // ARIA-declared interactive roles.
+  "[role=button]", "[role=link]", "[role=tab]",
+  // Inline click handlers and authored Skilly annotations.
+  "[onclick]", "[data-skilly]",
+  // Top-level headings.
+  "h1", "h2", "h3",
+].join(",");
+
+// Upper bound, in UTF-16 code units, on the length of a digest label.
+const MAX_LABEL_LENGTH = 80;
+
+/**
+ * Report whether an element is worth listing in the digest: its bounding box
+ * must exceed 1px in both dimensions and its computed style must not hide it
+ * via `visibility: hidden`, `display: none`, or an opacity of exactly "0".
+ */
+function isVisible(element: HTMLElement): boolean {
+  const { width, height } = element.getBoundingClientRect();
+  const tooSmall = width <= 1 || height <= 1;
+  if (tooSmall) {
+    return false;
+  }
+  const computed = window.getComputedStyle(element);
+  const hiddenByStyle =
+    computed.visibility === "hidden" || computed.display === "none" || computed.opacity === "0";
+  return !hiddenByStyle;
+}
+
+/**
+ * Compute a short human-readable name for an element.
+ *
+ * Sources are tried in priority order — aria-label, data-skilly-label,
+ * data-skilly, placeholder, title, alt, text content — and the first one that
+ * is non-empty after whitespace normalization wins. An attribute that is
+ * present but empty (e.g. `aria-label=""` or `alt=""`) is skipped rather than
+ * producing an empty label.
+ *
+ * @param element Element to label.
+ * @returns The label with whitespace runs collapsed to single spaces, trimmed,
+ *   and capped at `MAX_LABEL_LENGTH` characters; `""` when no source has text.
+ */
+function accessibleLabel(element: HTMLElement): string {
+  // Lazy lookups: `textContent` can be expensive on large subtrees, so it is
+  // only read when every higher-priority source came up empty.
+  const sources: Array<() => string | null | undefined> = [
+    () => element.getAttribute("aria-label"),
+    () => element.dataset.skillyLabel,
+    () => element.dataset.skilly,
+    () => element.getAttribute("placeholder"),
+    () => element.getAttribute("title"),
+    () => element.getAttribute("alt"),
+    () => element.textContent,
+  ];
+  for (const readSource of sources) {
+    const normalized = (readSource() ?? "").replace(/\s+/g, " ").trim();
+    if (normalized !== "") {
+      return normalized.slice(0, MAX_LABEL_LENGTH);
+    }
+  }
+  return "";
+}
+
+/**
+ * Classify an element into a coarse role. A non-empty explicit `role`
+ * attribute is returned unchanged; otherwise the role is derived from the tag
+ * name, with `region` as the catch-all.
+ */
+function coarseRole(element: HTMLElement): string {
+  const declaredRole = element.getAttribute("role");
+  if (declaredRole) {
+    return declaredRole;
+  }
+  switch (element.tagName.toLowerCase()) {
+    case "a":
+      return "link";
+    case "button":
+      return "button";
+    case "input":
+    case "select":
+    case "textarea":
+      return "input";
+    case "h1":
+    case "h2":
+    case "h3":
+      return "heading";
+    default:
+      return "region";
+  }
+}
+
+/**
+ * Build a digest of the current page together with the registry that maps each
+ * digest id back to its live element.
+ *
+ * Candidates are the visible elements matching `INTERACTIVE_SELECTOR`, with
+ * anything inside Skilly's own widget (`[data-skilly-widget]`) excluded.
+ * Annotated (`data-skilly`) elements are prioritized so authored targets
+ * always make the cut; the list is capped at `maxElements` and `truncated`
+ * flags any overflow (no silent truncation — a console warning is also logged).
+ *
+ * Ids are guaranteed unique within one digest: a non-blank `data-skilly` value
+ * is used as-is, elements without one get a positional `el_<n>` id, and any id
+ * that is already taken gets a numeric suffix (`pricing`, `pricing_2`, …) so
+ * no registry entry is ever overwritten.
+ *
+ * @param maxElements Maximum number of elements to include. Fractional values
+ *   are floored; non-positive or NaN values are treated as 0.
+ * @returns The digest and the id → element registry built alongside it.
+ */
+export function buildDomDigest(maxElements = 40): { digest: DomDigest; registry: ElementRegistry } {
+  // A negative cap would make `slice` count from the end, and NaN would hide
+  // the truncation flag, so normalize the cap before using it.
+  const limit = maxElements > 0 ? Math.floor(maxElements) : 0;
+
+  const candidates = Array.from(
+    document.querySelectorAll<HTMLElement>(INTERACTIVE_SELECTOR),
+  ).filter((element) => !element.closest("[data-skilly-widget]") && isVisible(element));
+
+  // Authored annotations first, then everything else. `sort` is stable, so DOM
+  // order is preserved within each group.
+  candidates.sort((first, second) => {
+    const firstAnnotated = first.hasAttribute("data-skilly") ? 0 : 1;
+    const secondAnnotated = second.hasAttribute("data-skilly") ? 0 : 1;
+    return firstAnnotated - secondAnnotated;
+  });
+
+  const truncated = candidates.length > limit;
+  const selected = candidates.slice(0, limit);
+
+  const registry: ElementRegistry = new Map();
+  const elements: DigestElement[] = selected.map((element, index) => {
+    const authoredId = element.dataset.skilly;
+    const baseId =
+      authoredId !== undefined && authoredId.trim() !== "" ? authoredId : `el_${index + 1}`;
+    // Disambiguate repeated ids so each digest entry maps to its own element.
+    let id = baseId;
+    for (let suffix = 2; registry.has(id); suffix += 1) {
+      id = `${baseId}_${suffix}`;
+    }
+    registry.set(id, element);
+    const rect = element.getBoundingClientRect();
+    return {
+      id,
+      role: coarseRole(element),
+      label: accessibleLabel(element),
+      rect: {
+        x: Math.round(rect.x),
+        y: Math.round(rect.y),
+        width: Math.round(rect.width),
+        height: Math.round(rect.height),
+      },
+    };
+  });
+
+  if (truncated) {
+    console.warn(`[skilly] DOM digest capped at ${limit}; ${candidates.length} candidates found.`);
+  }
+
+  const digest: DomDigest = {
+    url: window.location.href,
+    title: document.title,
+    viewport: { width: window.innerWidth, height: window.innerHeight },
+    elements,
+    truncated,
+  };
+  return { digest, registry };
+}
diff --git a/sdk/web/src/index.ts b/sdk/web/src/index.ts
index 6adb8598..c0dfbfe6 100644
--- a/sdk/web/src/index.ts
+++ b/sdk/web/src/index.ts
@@ -15,6 +15,8 @@
import { loadCore } from "./core.js";
import { SkillyWidget } from "./widget.js";
+import { buildDomDigest, type DomDigest, type ElementRegistry } from "./digest.js";
+import { parsePointTags, PointingEngine } from "./pointing.js";
import type {
SkillyConfig,
SkillyEventHandler,
@@ -26,6 +28,8 @@ const DEFAULT_ACCENT = "#2F6BFF";
class SkillyController {
private widget: SkillyWidget | null = null;
+ private pointing: PointingEngine | null = null;
+ private currentRegistry: ElementRegistry | null = null;
// Storage is type-erased; the public on()/emit() signatures keep callers type-safe.
private handlers = new Map void>>();
private turnInProgress = false;
@@ -43,23 +47,38 @@ class SkillyController {
this.widget = new SkillyWidget(config.accentColor ?? DEFAULT_ACCENT);
this.widget.onLauncherActivated = () => this.start();
this.widget.mount();
+ this.pointing = new PointingEngine(this.widget);
// Begin loading the shared WASM core in the background (optional in 8.1).
void loadCore(config.coreUrl);
}
+  /**
+   * Take a fresh DOM-digest snapshot of the host page — the structured,
+   * screenshot-free view the companion reasons over and references in
+   * [POINT:id] tags. The matching element registry is stored on the controller
+   * so a later point can be resolved against it. The AI integration that
+   * consumes the digest lands in Phase 8.3.
+   */
+  getPageDigest(): DomDigest {
+    const snapshot = buildDomDigest();
+    this.currentRegistry = snapshot.registry;
+    return snapshot.digest;
+  }
+
/**
* Open the companion and run a turn. 8.1 simulates the lifecycle
* (listening -> thinking -> speaking -> complete) so the embed is
* demonstrable; 8.3 replaces this with the OpenAI Realtime voice pipeline.
*/
start(goal?: string): void {
- if (!this.widget || this.turnInProgress) {
+ if (!this.widget || !this.pointing || this.turnInProgress) {
return;
}
this.turnInProgress = true;
this.emit("turn", { goal });
+ // Capture the page as a DOM digest at the start of the turn (8.3 sends it to the AI).
+ const digest = this.getPageDigest();
+
this.widget.setState("listening");
this.widget.setBubbleText("Listening…");
@@ -69,24 +88,54 @@ class SkillyController {
}, 800);
window.setTimeout(() => {
- this.widget?.setState("speaking");
- this.widget?.setBubbleText(
- goal
- ? `Let's get started with: ${goal}`
- : "Hi! I'm Skilly. Ask me how to do anything on this site and I'll point you to it.",
- );
- // Pointing demo (8.2 will resolve a real selector to these coordinates).
- this.widget?.moveCursorTo(window.innerWidth / 2, window.innerHeight / 2);
- this.emit("point", { selector: "body", label: "demo target" });
+ void this.respondAndPoint(goal, digest);
}, 1600);
window.setTimeout(() => {
this.widget?.setState("idle");
this.widget?.setBubbleText("");
- this.widget?.hideCursor();
+ this.pointing?.clear();
this.turnInProgress = false;
this.emit("complete", {});
- }, 3600);
+ }, 4200);
+ }
+
+  /**
+   * 8.2: simulate the companion's response (which, from 8.3, will come from the
+   * AI over the Realtime connection) and run its `[POINT:id:label]` tag through
+   * the real pointing engine against the live DOM.
+   *
+   * Switches the widget to the speaking state, shows the response text with the
+   * tag stripped, and emits a `point` event when the first tag resolves.
+   *
+   * @param goal Optional goal passed to `start()`, echoed in the intro text.
+   * @param digest Digest captured at the start of the turn; the simulated
+   *   target is chosen from its elements.
+   */
+  private async respondAndPoint(goal: string | undefined, digest: DomDigest): Promise<void> {
+    if (!this.widget || !this.pointing) {
+      return;
+    }
+    this.widget.setState("speaking");
+
+    // Pick a real, demonstrable target: an authored annotation, else a heading.
+    const target =
+      digest.elements.find((element) => !/^el_\d+$/.test(element.id)) ??
+      digest.elements.find((element) => element.role === "heading") ??
+      digest.elements[0];
+
+    const intro = goal ? `Let's start with "${goal}".` : "Hi! I'm Skilly.";
+    // An element can have an empty label (no attributes or text); fall back to
+    // its id so neither the sentence nor the POINT tag ends up with a blank name.
+    const targetName = target ? target.label || target.id : "";
+    const simulatedResponse = target
+      ? `${intro} ${targetName} is right here. [POINT:${target.id}:${targetName}]`
+      : `${intro} Ask me how to do anything on this site and I'll point you to it.`;
+
+    const { cleanedText, points } = parsePointTags(simulatedResponse);
+    this.widget.setBubbleText(cleanedText);
+
+    const firstPoint = points[0];
+    if (firstPoint) {
+      const resolved = await this.pointing.pointAt(
+        firstPoint.target,
+        firstPoint.label,
+        this.currentRegistry ?? undefined,
+      );
+      if (resolved) {
+        this.emit("point", { selector: firstPoint.target, label: resolved.label });
+      }
+    }
   }
/** Subscribe to a companion event. Returns an unsubscribe function. */
@@ -111,6 +160,9 @@ class SkillyController {
/** Tear down the widget and clear subscriptions. */
destroy(): void {
+ this.pointing?.clear();
+ this.pointing = null;
+ this.currentRegistry = null;
this.widget?.destroy();
this.widget = null;
this.handlers.clear();
@@ -139,8 +191,11 @@ export const on = (
export const identify = (endUserId: string, traits?: Record): void =>
controller.identify(endUserId, traits);
export const destroy = (): void => controller.destroy();
+/**
+ * Snapshot the host page as a DOM digest (the screenshot-free page view).
+ * Delegates to the shared controller, which also replaces its stored element
+ * registry with the one built for this snapshot.
+ */
+export const getPageDigest = (): DomDigest => controller.getPageDigest();
export type { SkillyConfig, SkillyEventMap, SkillyEventName } from "./types.js";
+export type { DomDigest, DigestElement } from "./digest.js";
// Auto-init from `