diff --git a/AGENTS.md b/AGENTS.md index 03b2265d..105c2932 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -220,11 +220,13 @@ The embeddable companion: a **vanilla-TS + Shadow-DOM** widget (no framework — | File | Purpose | |------|---------| | `sdk/web/src/index.ts` | Public `Skilly` API (`init`/`start`/`on`/`identify`/`destroy`) + auto-init from `data-skilly-*` script attrs + typed event emitter. | -| `sdk/web/src/widget.ts` | Shadow-DOM UI: launcher button, response bubble, blue cursor element + `moveCursorTo`. | +| `sdk/web/src/widget.ts` | Shadow-DOM UI: launcher button, response bubble, blue cursor + `setCursorPosition` (driven per-frame by the pointing engine). | | `sdk/web/src/core.ts` | Lazy, tolerant loader for the `core/web-sdk` WASM (widget runs UI-only if absent). | +| `sdk/web/src/digest.ts` | **8.2** DOM digest: structured, screenshot-free page view (interactive/annotated elements → stable ids, labels, rects) + element registry. `getPageDigest()`. | +| `sdk/web/src/pointing.ts` | **8.2** Pointing engine: parse `[POINT:id:label]`, resolve (digest id / `data-skilly` / CSS / visible text), bezier-arc cursor flight, scroll/resize re-anchor. | | `sdk/web/demo/index.html` | Demo host page (`bun run demo`). | -> 8.1 is the embed SKELETON with a simulated turn lifecycle (listening→thinking→speaking→complete). Validated: `bun run typecheck` + `bun run build` clean; Playwright confirms the widget mounts, the launcher renders, and `start()` shows the bubble + cursor. Next: **8.2** DOM digest + selector pointing · **8.3** OpenAI Realtime voice · **8.4+** multi-tenant Next.js backend. `dist/`, `node_modules/`, `generated/` are gitignored. +> A simulated turn lifecycle (listening→thinking→speaking→complete) keeps the embed demonstrable until 8.3. Validated: `bun run typecheck` + `bun run build` clean; Playwright confirms the widget mounts and renders, and (8.2) that the cursor flies a bezier arc and lands **exactly** on a `data-skilly`-resolved element (0px error). Next: **8.3** OpenAI Realtime voice (replaces the simulated turn) · **8.4+** multi-tenant Next.js backend. `dist/`, `node_modules/`, `generated/` are gitignored. ### Skill Files diff --git a/sdk/web/README.md b/sdk/web/README.md index b1734a13..59e8c83c 100644 --- a/sdk/web/README.md +++ b/sdk/web/README.md @@ -8,17 +8,21 @@ install it on their own web app; their visitors get the companion. See This package consumes the shared Rust core compiled to WASM (`core/web-sdk`, output in `sdk/web/generated/`). -## Status — Phase 8.1 (embed skeleton) +## Status — Phases 8.1 + 8.2 What's here: -- `@skilly/web` package: Shadow-DOM widget (launcher, response bubble, blue - cursor), the public `Skilly` API, and the lazy WASM-core loader. +- **8.1** `@skilly/web` package: Shadow-DOM widget (launcher, response bubble, + blue cursor), the public `Skilly` API, and the lazy WASM-core loader. +- **8.2** **DOM digest** (`getPageDigest()`) — a structured, screenshot-free view + of the page's interactive/annotated elements with stable ids + rects — and the + **selector-based pointing engine**: `[POINT:id:label]` → resolve (digest id / + `data-skilly` / CSS / visible text) → **bezier-arc cursor flight** → re-anchor + on scroll/resize. - A simulated turn lifecycle (listening → thinking → speaking → complete) so the embed is demonstrable end-to-end. -Layered on next: **8.2** DOM digest + selector-based pointing · **8.3** OpenAI -Realtime voice pipeline · **8.4+** multi-tenant Next.js backend (keys, metering, -SKILL.md serving). +Layered on next: **8.3** OpenAI Realtime voice pipeline (replaces the simulated +turn) · **8.4+** multi-tenant Next.js backend (keys, metering, SKILL.md serving). ## Install / embed diff --git a/sdk/web/demo/index.html b/sdk/web/demo/index.html index 80021c4c..029291a1 100644 --- a/sdk/web/demo/index.html +++ b/sdk/web/demo/index.html @@ -26,9 +26,9 @@

Getting started

This page simulates a website owner's app. The @skilly/web widget is mounted via a single script tag.

-
+

Pricing

-

A section the companion could point at (target resolution lands in Phase 8.2).

+

A section the companion points at — resolved via its data-skilly annotation and a bezier-arc cursor flight (Phase 8.2).

diff --git a/sdk/web/src/digest.ts b/sdk/web/src/digest.ts new file mode 100644 index 00000000..cc00350a --- /dev/null +++ b/sdk/web/src/digest.ts @@ -0,0 +1,140 @@ +// DOM digest — the web analog of the desktop screenshot. +// +// Instead of sending pixels, we send the host page's *structure*: a compact, +// stable list of the interactive / annotated / heading elements the companion +// can talk about and point at. Each entry has a stable id, an accessible label, +// a role, and its current viewport rect. The accompanying registry maps ids +// back to live elements so the pointing engine can resolve a target. +// +// This is cheaper, more accurate, and more privacy-friendly than a screenshot +// (see docs/architecture/web-sdk-prd.md §6). + +export interface DigestElementRect { + x: number; + y: number; + width: number; + height: number; +} + +export interface DigestElement { + /** Stable id the AI references in a [POINT:id:label] tag. Prefers data-skilly. */ + id: string; + /** Coarse role: button | link | input | heading | region. */ + role: string; + /** Accessible name (aria-label / data-skilly / visible text / placeholder). */ + label: string; + rect: DigestElementRect; +} + +export interface DomDigest { + url: string; + title: string; + viewport: { width: number; height: number }; + elements: DigestElement[]; + /** True when more elements existed than `maxElements` — never silently dropped. */ + truncated: boolean; +} + +/** Maps a digest id to the live element, for the pointing engine to resolve. */ +export type ElementRegistry = Map; + +const INTERACTIVE_SELECTOR = [ + "a[href]", + "button", + "input:not([type=hidden])", + "select", + "textarea", + "[role=button]", + "[role=link]", + "[role=tab]", + "[onclick]", + "[data-skilly]", + "h1", + "h2", + "h3", +].join(","); + +const MAX_LABEL_LENGTH = 80; + +function isVisible(element: HTMLElement): boolean { + const rect = element.getBoundingClientRect(); + if (rect.width <= 1 || rect.height <= 1) { + return false; + } + const style = window.getComputedStyle(element); + return style.visibility !== "hidden" && style.display !== "none" && style.opacity !== "0"; +} + +function accessibleLabel(element: HTMLElement): string { + const candidate = + element.getAttribute("aria-label") ?? + element.dataset.skillyLabel ?? + element.dataset.skilly ?? + element.getAttribute("placeholder") ?? + element.getAttribute("title") ?? + element.getAttribute("alt") ?? + element.textContent ?? + ""; + return candidate.replace(/\s+/g, " ").trim().slice(0, MAX_LABEL_LENGTH); +} + +function coarseRole(element: HTMLElement): string { + const explicitRole = element.getAttribute("role"); + if (explicitRole) { + return explicitRole; + } + const tag = element.tagName.toLowerCase(); + if (tag === "a") return "link"; + if (tag === "button") return "button"; + if (tag === "input" || tag === "select" || tag === "textarea") return "input"; + if (tag === "h1" || tag === "h2" || tag === "h3") return "heading"; + return "region"; +} + +/** + * Build a digest of the current page. Annotated (`data-skilly`) elements are + * prioritized so authored targets always make the cut; the list is capped at + * `maxElements` and `truncated` flags any overflow (no silent truncation). + * Skilly's own widget is excluded. + */ +export function buildDomDigest(maxElements = 40): { digest: DomDigest; registry: ElementRegistry } { + const candidates = Array.from( + document.querySelectorAll(INTERACTIVE_SELECTOR), + ).filter((element) => !element.closest("[data-skilly-widget]") && isVisible(element)); + + // Authored annotations first, then everything else (preserving DOM order within each group). + candidates.sort((first, second) => { + const firstAnnotated = first.hasAttribute("data-skilly") ? 0 : 1; + const secondAnnotated = second.hasAttribute("data-skilly") ? 0 : 1; + return firstAnnotated - secondAnnotated; + }); + + const truncated = candidates.length > maxElements; + const selected = candidates.slice(0, maxElements); + + const registry: ElementRegistry = new Map(); + const elements: DigestElement[] = selected.map((element, index) => { + const id = element.dataset.skilly ?? `el_${index + 1}`; + registry.set(id, element); + const rect = element.getBoundingClientRect(); + return { + id, + role: coarseRole(element), + label: accessibleLabel(element), + rect: { x: Math.round(rect.x), y: Math.round(rect.y), width: Math.round(rect.width), height: Math.round(rect.height) }, + }; + }); + + if (truncated) { + console.warn(`[skilly] DOM digest capped at ${maxElements}; ${candidates.length} candidates found.`); + } + + const digest: DomDigest = { + url: window.location.href, + title: document.title, + viewport: { width: window.innerWidth, height: window.innerHeight }, + elements, + truncated, + }; + return { digest, registry }; +} diff --git a/sdk/web/src/index.ts b/sdk/web/src/index.ts index 6adb8598..c0dfbfe6 100644 --- a/sdk/web/src/index.ts +++ b/sdk/web/src/index.ts @@ -15,6 +15,8 @@ import { loadCore } from "./core.js"; import { SkillyWidget } from "./widget.js"; +import { buildDomDigest, type DomDigest, type ElementRegistry } from "./digest.js"; +import { parsePointTags, PointingEngine } from "./pointing.js"; import type { SkillyConfig, SkillyEventHandler, @@ -26,6 +28,8 @@ const DEFAULT_ACCENT = "#2F6BFF"; class SkillyController { private widget: SkillyWidget | null = null; + private pointing: PointingEngine | null = null; + private currentRegistry: ElementRegistry | null = null; // Storage is type-erased; the public on()/emit() signatures keep callers type-safe. private handlers = new Map void>>(); private turnInProgress = false; @@ -43,23 +47,38 @@ class SkillyController { this.widget = new SkillyWidget(config.accentColor ?? DEFAULT_ACCENT); this.widget.onLauncherActivated = () => this.start(); this.widget.mount(); + this.pointing = new PointingEngine(this.widget); // Begin loading the shared WASM core in the background (optional in 8.1). void loadCore(config.coreUrl); } + /** + * Snapshot the host page as a DOM digest — the structured, screenshot-free + * view the companion reasons over (and references in [POINT:id] tags). The + * AI integration that consumes this lands in Phase 8.3. + */ + getPageDigest(): DomDigest { + const { digest, registry } = buildDomDigest(); + this.currentRegistry = registry; + return digest; + } + /** * Open the companion and run a turn. 8.1 simulates the lifecycle * (listening -> thinking -> speaking -> complete) so the embed is * demonstrable; 8.3 replaces this with the OpenAI Realtime voice pipeline. */ start(goal?: string): void { - if (!this.widget || this.turnInProgress) { + if (!this.widget || !this.pointing || this.turnInProgress) { return; } this.turnInProgress = true; this.emit("turn", { goal }); + // Capture the page as a DOM digest at the start of the turn (8.3 sends it to the AI). + const digest = this.getPageDigest(); + this.widget.setState("listening"); this.widget.setBubbleText("Listening…"); @@ -69,24 +88,54 @@ class SkillyController { }, 800); window.setTimeout(() => { - this.widget?.setState("speaking"); - this.widget?.setBubbleText( - goal - ? `Let's get started with: ${goal}` - : "Hi! I'm Skilly. Ask me how to do anything on this site and I'll point you to it.", - ); - // Pointing demo (8.2 will resolve a real selector to these coordinates). - this.widget?.moveCursorTo(window.innerWidth / 2, window.innerHeight / 2); - this.emit("point", { selector: "body", label: "demo target" }); + void this.respondAndPoint(goal, digest); }, 1600); window.setTimeout(() => { this.widget?.setState("idle"); this.widget?.setBubbleText(""); - this.widget?.hideCursor(); + this.pointing?.clear(); this.turnInProgress = false; this.emit("complete", {}); - }, 3600); + }, 4200); + } + + /** + * 8.2: simulate the companion's response (which, from 8.3, will come from the + * AI over the Realtime connection) and run its `[POINT:id:label]` tag through + * the real pointing engine against the live DOM. + */ + private async respondAndPoint(goal: string | undefined, digest: DomDigest): Promise { + if (!this.widget || !this.pointing) { + return; + } + this.widget.setState("speaking"); + + // Pick a real, demonstrable target: an authored annotation, else a heading. + const target = + digest.elements.find((element) => !/^el_\d+$/.test(element.id)) ?? + digest.elements.find((element) => element.role === "heading") ?? + digest.elements[0]; + + const intro = goal ? `Let's start with "${goal}".` : "Hi! I'm Skilly."; + const simulatedResponse = target + ? `${intro} ${target.label} is right here. [POINT:${target.id}:${target.label}]` + : `${intro} Ask me how to do anything on this site and I'll point you to it.`; + + const { cleanedText, points } = parsePointTags(simulatedResponse); + this.widget.setBubbleText(cleanedText); + + const firstPoint = points[0]; + if (firstPoint) { + const resolved = await this.pointing.pointAt( + firstPoint.target, + firstPoint.label, + this.currentRegistry ?? undefined, + ); + if (resolved) { + this.emit("point", { selector: firstPoint.target, label: resolved.label }); + } + } } /** Subscribe to a companion event. Returns an unsubscribe function. */ @@ -111,6 +160,9 @@ class SkillyController { /** Tear down the widget and clear subscriptions. */ destroy(): void { + this.pointing?.clear(); + this.pointing = null; + this.currentRegistry = null; this.widget?.destroy(); this.widget = null; this.handlers.clear(); @@ -139,8 +191,11 @@ export const on = ( export const identify = (endUserId: string, traits?: Record): void => controller.identify(endUserId, traits); export const destroy = (): void => controller.destroy(); +/** Snapshot the host page as a DOM digest (the screenshot-free page view). */ +export const getPageDigest = (): DomDigest => controller.getPageDigest(); export type { SkillyConfig, SkillyEventMap, SkillyEventName } from "./types.js"; +export type { DomDigest, DigestElement } from "./digest.js"; // Auto-init from `