diff --git a/.gitignore b/.gitignore index 4be5b64..eee0f26 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,5 @@ playground # Temporary directories from sync workflows extension-temp/ - +playground +.env diff --git a/package-lock.json b/package-lock.json index 5f67204..3b779de 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@predicatesystems/runtime", - "version": "1.1.0", + "version": "1.2.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@predicatesystems/runtime", - "version": "1.1.0", + "version": "1.2.0", "license": "(MIT OR Apache-2.0)", "dependencies": { "canvas": "^3.2.1", @@ -27,6 +27,7 @@ "@types/uuid": "^9.0.0", "@typescript-eslint/eslint-plugin": "^8.51.0", "@typescript-eslint/parser": "^8.51.0", + "dotenv": "^17.4.2", "eslint": "^9.39.2", "eslint-config-prettier": "^10.1.8", "eslint-plugin-prettier": "^5.5.4", @@ -3093,6 +3094,19 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/dotenv": { + "version": "17.4.2", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.4.2.tgz", + "integrity": "sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", diff --git a/package.json b/package.json index a73e6fa..d908b7c 100644 --- a/package.json +++ b/package.json @@ -48,6 +48,7 @@ "@types/uuid": "^9.0.0", "@typescript-eslint/eslint-plugin": "^8.51.0", "@typescript-eslint/parser": "^8.51.0", + "dotenv": "^17.4.2", "eslint": "^9.39.2", "eslint-config-prettier": "^10.1.8", "eslint-plugin-prettier": "^5.5.4", diff --git a/src/agents/planner-executor/DEFERRED_FEATURES.md b/src/agents/planner-executor/DEFERRED_FEATURES.md new file mode 100644 index 0000000..12927e2 --- /dev/null +++ b/src/agents/planner-executor/DEFERRED_FEATURES.md @@ -0,0 +1,333 @@ +# PlannerExecutorAgent: Deferred Features + +**Date:** 2026-04-13 +**Status:** Documentation for post-MVP implementation + +## Overview + +This document outlines features from the Python `PlannerExecutorAgent` that were deferred from the TypeScript MVP port. These features add reliability and flexibility but are not required for basic browser automation tasks. + +## MVP Implementation Summary + +The TypeScript MVP includes: + +1. **Core Agent (~600 lines)** + - Stepwise (ReAct-style) planning loop + - Action parsing (CLICK, TYPE, SCROLL, PRESS, DONE) + - Compact context formatting for small models + - Token usage tracking by role and model + +2. **Reliability Features (~200 lines)** + - Snapshot escalation (progressive limit increase) + - Pre-action authorization hook (for sidecar policy) + - Basic error handling and retry + +3. **Configuration (~250 lines)** + - `PlannerExecutorConfig` with presets + - `SnapshotEscalationConfig`, `RetryConfig`, `StepwisePlanningConfig` + - Factory helpers for provider creation + +## Deferred Features + +### 1. Modal/Overlay Dismissal + +**Python Reference:** `ModalDismissalConfig`, `_attempt_modal_dismissal()` + +**Description:** Automatically dismiss blocking overlays after DOM changes: + +- Product protection/warranty upsells +- Cookie consent banners +- Newsletter signup popups +- Promotional overlays +- Cart upsell drawers + +**Implementation Effort:** ~150 lines + +**Config Interface:** + +```typescript +interface ModalDismissalConfig { + enabled: boolean; + dismissPatterns: string[]; // e.g., ['close', 'no thanks', 'skip'] + dismissIcons: string[]; // e.g., ['×', '✕', 'x'] + roleFilter: string[]; // e.g., ['button', 'link'] + maxAttempts: number; + minNewElements: number; // Minimum DOM changes to trigger +} +``` + +**Key Logic:** + +- Detect DOM changes after CLICK actions +- Find buttons matching dismissal patterns (word-boundary matching) +- Click dismissal button and verify modal closed +- Skip if checkout-related buttons are present + +--- + +### 2. Captcha Handling + +**Python Reference:** `CaptchaConfig`, `_detect_captcha()`, `_handle_captcha()` + +**Description:** Detect and handle CAPTCHAs during automation: + +- Policy options: `abort`, `callback`, `pause` +- Support for external solving services +- Detection via element text and patterns + +**Implementation Effort:** ~100 lines + +**Config Interface:** + +```typescript +interface CaptchaConfig { + enabled: boolean; + policy: 'abort' | 'callback' | 'pause'; + detectionPatterns: string[]; + solverCallback?: (imageBase64: string) => Promise; + maxWaitMs: number; +} +``` + +**Key Logic:** + +- Check snapshot elements for CAPTCHA indicators +- Based on policy: abort task, call external solver, or pause for human +- Resume automation after CAPTCHA solved + +--- + +### 3. Vision Fallback + +**Python Reference:** `VisionFallbackConfig`, vision_executor, vision_verifier + +**Description:** Use vision-capable models when DOM-based automation fails: + +- Canvas pages with no accessible elements +- Low element confidence scores +- Complex visual layouts + +**Implementation Effort:** ~200 lines + +**Config Interface:** + +```typescript +interface VisionFallbackConfig { + enabled: boolean; + maxVisionCalls: number; + triggerRequiresVision: boolean; + triggerCanvasOrLowActionables: boolean; + canvasDetectionThreshold: number; + lowActionablesThreshold: number; +} +``` + +**Key Logic:** + +- Detect snapshot failures (low elements, canvas pages) +- Switch to vision executor with screenshot input +- Use vision verifier for state verification +- Fall back gracefully to DOM mode when possible + +--- + +### 4. Intent Heuristics + +**Python Reference:** `IntentHeuristics` protocol, `_try_intent_heuristics()` + +**Description:** Pluggable domain-specific element selection without LLM: + +- E-commerce: "Add to Cart", "Checkout" buttons +- Authentication: login forms, password fields +- Search: search boxes, result links + +**Implementation Effort:** ~100 lines + +**Interface:** + +```typescript +interface IntentHeuristics { + findElementForIntent( + intent: string, + elements: SnapshotElement[], + url: string, + goal: string + ): number | null; + + priorityOrder(): string[]; +} + +// Example implementation +class EcommerceHeuristics implements IntentHeuristics { + findElementForIntent(intent, elements, url, goal) { + if (intent.toLowerCase().includes('add to cart')) { + const btn = elements.find(el => el.text?.toLowerCase().includes('add to cart')); + return btn?.id ?? null; + } + return null; // Fall back to LLM + } + + priorityOrder() { + return ['add_to_cart', 'checkout', 'search']; + } +} +``` + +**Key Logic:** + +- Check heuristics before calling executor LLM +- Reduces token usage for common patterns +- Improves reliability for known sites + +--- + +### 5. Recovery Navigation + +**Python Reference:** `RecoveryNavigationConfig`, `_last_known_good_url` + +**Description:** Track and recover from off-track navigation: + +- Remember last URL where verification passed +- Navigate back when subsequent steps fail +- Detect when agent is lost + +**Implementation Effort:** ~80 lines + +**Config Interface:** + +```typescript +interface RecoveryNavigationConfig { + enabled: boolean; + maxRecoveryAttempts: number; + trackSuccessfulUrls: boolean; +} +``` + +**Key Logic:** + +- Store URL after successful verification +- On repeated failures, navigate back to last good URL +- Replan from recovered state + +--- + +### 6. Checkout/Auth Boundary Detection + +**Python Reference:** `CheckoutDetectionConfig`, `AuthBoundaryConfig` + +**Description:** Detect when agent reaches boundaries that require human intervention: + +- Checkout pages requiring payment info +- Login/signup pages requiring credentials +- Age verification gates + +**Implementation Effort:** ~60 lines + +**Config Interface:** + +```typescript +interface CheckoutDetectionConfig { + enabled: boolean; + urlPatterns: string[]; // e.g., ['/checkout', '/payment'] + elementPatterns: string[]; // e.g., ['credit card', 'payment'] + stopOnDetection: boolean; +} + +interface AuthBoundaryConfig { + enabled: boolean; + urlPatterns: string[]; // e.g., ['/login', '/signin'] + elementPatterns: string[]; // e.g., ['sign in', 'log in'] + stopOnDetection: boolean; +} +``` + +--- + +### 7. Executor Override + +**Python Reference:** `ExecutorOverride` protocol + +**Description:** Validate or override executor's element choices before action: + +- Safety checks (block delete buttons) +- Domain-specific corrections +- Audit logging + +**Implementation Effort:** ~50 lines + +**Interface:** + +```typescript +interface ExecutorOverride { + validateChoice( + elementId: number, + action: string, + elements: SnapshotElement[], + goal: string + ): { + valid: boolean; + overrideElementId?: number; + rejectionReason?: string; + }; +} +``` + +--- + +### 8. Upfront Planning Mode + +**Python Reference:** `plan()`, `replan()` methods + +**Description:** Generate full execution plan upfront (alternative to stepwise): + +- Better for known workflows +- Supports plan patching on failure +- More efficient for simple tasks + +**Implementation Effort:** ~200 lines + +**Key Functions:** + +- `plan(task, startUrl)` - Generate full plan +- `replan(task, failedStep, reason)` - Patch plan after failure +- `run(runtime, task)` - Execute with upfront planning + +--- + +### 9. Task Category Pruning + +**Python Reference:** `PruningTaskCategory`, `prune_with_recovery()` + +**Description:** Category-specific element filtering to reduce context size: + +- Shopping: prioritize product/cart elements +- Search: prioritize search box/results +- Auth: prioritize form fields + +**Implementation Effort:** ~150 lines + +**Categories:** + +- `shopping`, `checkout`, `search`, `auth`, `form_filling`, `extraction`, `navigation` + +--- + +## Implementation Priority + +Recommended order based on impact and complexity: + +1. **Intent Heuristics** - High impact, low complexity, reduces token usage +2. **Modal Dismissal** - Common pain point, medium complexity +3. **Vision Fallback** - Required for canvas/complex pages +4. **Captcha Handling** - Needed for production use +5. **Recovery Navigation** - Improves reliability +6. **Upfront Planning** - Alternative mode for simple tasks +7. **Boundary Detection** - Nice to have for graceful stops +8. **Executor Override** - Nice to have for safety +9. **Task Category Pruning** - Optimization for large pages + +## References + +- Python implementation: `sdk-python/predicate/agents/planner_executor_agent.py` +- Design doc: `docs/sdk-ts-doc/2026-03-28_planner_executor_agent_port.md` +- Chrome extension feasibility: `docs/sdk-python-doc/2026-04-13_predicate_chrome_extension_agent_feasibility.md` diff --git a/src/agents/planner-executor/agent-factory.ts b/src/agents/planner-executor/agent-factory.ts index 00c06eb..1127b41 100644 --- a/src/agents/planner-executor/agent-factory.ts +++ b/src/agents/planner-executor/agent-factory.ts @@ -35,6 +35,9 @@ export interface CreateAgentOptions { /** Ollama server URL (default: http://localhost:11434) */ ollamaBaseUrl?: string; + /** Timeout for Ollama requests in ms (default: 120000 for local models) */ + ollamaTimeoutMs?: number; + /** OpenAI API key (defaults to OPENAI_API_KEY env var) */ openaiApiKey?: string; @@ -95,6 +98,7 @@ export function createProvider( provider: 'auto' | 'ollama' | 'openai' | 'anthropic', options: { ollamaBaseUrl?: string; + ollamaTimeoutMs?: number; openaiApiKey?: string; anthropicApiKey?: string; } @@ -106,6 +110,8 @@ export function createProvider( return new OllamaProvider({ model, baseUrl: options.ollamaBaseUrl ?? 'http://localhost:11434', + // Default 120s for local models (they're slower and may include reasoning) + timeoutMs: options.ollamaTimeoutMs ?? 120_000, }); case 'openai': { @@ -228,6 +234,7 @@ export async function createPlannerExecutorAgentProviders( plannerProvider = 'auto', executorProvider = 'auto', ollamaBaseUrl, + ollamaTimeoutMs, openaiApiKey, anthropicApiKey, config, @@ -238,12 +245,14 @@ export async function createPlannerExecutorAgentProviders( // Create providers const planner = createProvider(plannerModel, plannerProvider, { ollamaBaseUrl, + ollamaTimeoutMs, openaiApiKey, anthropicApiKey, }); const executor = createProvider(executorModel, executorProvider, { ollamaBaseUrl, + ollamaTimeoutMs, openaiApiKey, anthropicApiKey, }); diff --git a/src/agents/planner-executor/boundary-detection.ts b/src/agents/planner-executor/boundary-detection.ts new file mode 100644 index 0000000..a4fbfe3 --- /dev/null +++ b/src/agents/planner-executor/boundary-detection.ts @@ -0,0 +1,228 @@ +/** + * Boundary Detection for Authentication and Checkout Pages + * + * Detects when the agent reaches terminal states that require special handling: + * - Authentication boundaries (login/sign-in pages) + * - Checkout pages (may require different handling) + * + * Authentication boundaries are graceful terminal states - the agent has + * successfully navigated as far as possible without credentials. + */ + +/** + * Configuration for authentication boundary detection. + */ +export interface AuthBoundaryConfig { + /** Whether auth boundary detection is enabled (default: true) */ + enabled: boolean; + /** URL patterns indicating authentication pages */ + urlPatterns: string[]; + /** If true, mark run as successful when auth boundary reached (default: true) */ + stopOnAuth: boolean; + /** Message to include in outcome when stopping at auth (default: "Reached authentication boundary (login required)") */ + authSuccessMessage: string; +} + +/** + * Default auth boundary configuration. + */ +export const DEFAULT_AUTH_BOUNDARY_CONFIG: AuthBoundaryConfig = { + enabled: true, + urlPatterns: [ + '/signin', + '/sign-in', + '/login', + '/log-in', + '/auth', + '/authenticate', + '/ap/signin', // Amazon sign-in + '/ap/register', // Amazon registration + '/ax/claim', // Amazon CAPTCHA/verification + '/account/login', + '/accounts/login', + '/user/login', + ], + stopOnAuth: true, + authSuccessMessage: 'Reached authentication boundary (login required)', +}; + +/** + * Configuration for checkout page detection. + */ +export interface CheckoutDetectionConfig { + /** Whether checkout detection is enabled (default: true) */ + enabled: boolean; + /** URL patterns indicating cart pages */ + cartUrlPatterns: string[]; + /** URL patterns indicating checkout pages */ + checkoutUrlPatterns: string[]; + /** Element text patterns indicating checkout-related buttons */ + checkoutElementPatterns: string[]; +} + +/** + * Default checkout detection configuration. + */ +export const DEFAULT_CHECKOUT_CONFIG: CheckoutDetectionConfig = { + enabled: true, + cartUrlPatterns: ['/cart', '/basket', '/bag', '/shopping-cart', '/gp/cart'], + checkoutUrlPatterns: ['/checkout', '/buy', '/order', '/payment', '/purchase', '/gp/checkout'], + checkoutElementPatterns: [ + 'proceed to checkout', + 'go to checkout', + 'view cart', + 'shopping cart', + 'your cart', + 'sign in to checkout', + 'continue to payment', + 'place your order', + 'buy now', + ], +}; + +/** + * Result of auth boundary detection. + */ +export interface AuthBoundaryResult { + /** Whether an auth boundary was detected */ + isAuthBoundary: boolean; + /** The URL pattern that matched (if any) */ + matchedPattern: string | null; +} + +/** + * Result of checkout page detection. + */ +export interface CheckoutDetectionResult { + /** Whether a checkout-related page was detected */ + isCheckoutRelated: boolean; + /** Whether it's a cart page */ + isCart: boolean; + /** Whether it's a checkout page */ + isCheckout: boolean; + /** The URL pattern that matched (if any) */ + matchedPattern: string | null; +} + +/** + * Detect if the current URL is an authentication boundary. + * + * An auth boundary is a login/sign-in page where the agent cannot + * proceed without credentials. This is a terminal state. + * + * @param url - Current page URL + * @param config - Auth boundary configuration + * @returns Auth boundary detection result + * + * @example + * ```typescript + * const result = detectAuthBoundary('https://amazon.com/ap/signin', config); + * if (result.isAuthBoundary) { + * console.log(`Auth page detected: ${result.matchedPattern}`); + * } + * ``` + */ +export function detectAuthBoundary( + url: string, + config: AuthBoundaryConfig = DEFAULT_AUTH_BOUNDARY_CONFIG +): AuthBoundaryResult { + if (!config.enabled || !url) { + return { isAuthBoundary: false, matchedPattern: null }; + } + + const urlLower = url.toLowerCase(); + + for (const pattern of config.urlPatterns) { + if (urlLower.includes(pattern.toLowerCase())) { + return { isAuthBoundary: true, matchedPattern: pattern }; + } + } + + return { isAuthBoundary: false, matchedPattern: null }; +} + +/** + * Detect if the current URL is a checkout-related page. + * + * @param url - Current page URL + * @param config - Checkout detection configuration + * @returns Checkout detection result + * + * @example + * ```typescript + * const result = detectCheckoutPage('https://shop.com/checkout', config); + * if (result.isCheckout) { + * console.log('On checkout page'); + * } + * ``` + */ +export function detectCheckoutPage( + url: string, + config: CheckoutDetectionConfig = DEFAULT_CHECKOUT_CONFIG +): CheckoutDetectionResult { + if (!config.enabled || !url) { + return { + isCheckoutRelated: false, + isCart: false, + isCheckout: false, + matchedPattern: null, + }; + } + + const urlLower = url.toLowerCase(); + + // Check cart patterns + for (const pattern of config.cartUrlPatterns) { + if (urlLower.includes(pattern.toLowerCase())) { + return { + isCheckoutRelated: true, + isCart: true, + isCheckout: false, + matchedPattern: pattern, + }; + } + } + + // Check checkout patterns + for (const pattern of config.checkoutUrlPatterns) { + if (urlLower.includes(pattern.toLowerCase())) { + return { + isCheckoutRelated: true, + isCart: false, + isCheckout: true, + matchedPattern: pattern, + }; + } + } + + return { + isCheckoutRelated: false, + isCart: false, + isCheckout: false, + matchedPattern: null, + }; +} + +/** + * Check if an element text matches checkout-related patterns. + * + * @param text - Element text to check + * @param config - Checkout detection configuration + * @returns true if text matches a checkout pattern + */ +export function isCheckoutElement( + text: string, + config: CheckoutDetectionConfig = DEFAULT_CHECKOUT_CONFIG +): boolean { + if (!text) return false; + + const textLower = text.toLowerCase(); + + for (const pattern of config.checkoutElementPatterns) { + if (textLower.includes(pattern)) { + return true; + } + } + + return false; +} diff --git a/src/agents/planner-executor/config.ts b/src/agents/planner-executor/config.ts index 2402853..cb481d5 100644 --- a/src/agents/planner-executor/config.ts +++ b/src/agents/planner-executor/config.ts @@ -8,16 +8,41 @@ * Snapshot escalation configuration for reliable element capture. * * When element selection fails, the agent can retry with increasing element limits. + * After exhausting limit escalation, scroll-after-escalation can be used to find + * elements that may be outside the current viewport. + * + * @example + * ```typescript + * // Default: escalation enabled with step=30 + * const config: SnapshotEscalationConfig = { enabled: true, limitBase: 60, limitStep: 30 }; + * + * // Enable scroll-after-escalation to find elements below/above viewport + * const config: SnapshotEscalationConfig = { + * ...DEFAULT_CONFIG.snapshot, + * scrollAfterEscalation: true, + * scrollDirections: ['down', 'up'], + * }; + * ``` */ export interface SnapshotEscalationConfig { /** Whether escalation is enabled (default: true) */ enabled: boolean; - /** Starting element limit (default: 50) */ + /** Starting element limit (default: 60) */ limitBase: number; - /** Increase per escalation step (default: 25) */ + /** Increase per escalation step (default: 30) */ limitStep: number; /** Maximum element limit (default: 200) */ limitMax: number; + /** Whether to scroll after limit escalation is exhausted (default: true) */ + scrollAfterEscalation: boolean; + /** Maximum scroll attempts per direction (default: 3) */ + scrollMaxAttempts: number; + /** Directions to try scrolling (default: ['down', 'up']) */ + scrollDirections: Array<'up' | 'down'>; + /** Scroll amount as fraction of viewport height (default: 0.4 = 40%) */ + scrollViewportFraction: number; + /** Stabilization delay after scroll in ms (default: 300) */ + scrollStabilizeMs: number; } /** @@ -86,9 +111,16 @@ export interface PlannerExecutorConfig { export const DEFAULT_CONFIG: PlannerExecutorConfig = { snapshot: { enabled: true, - limitBase: 50, - limitStep: 25, - limitMax: 200, + // Same defaults as Python SDK - formatContext uses multi-strategy selection + // to ensure product links are captured even with lower snapshot limits + limitBase: 60, // Initial snapshot limit (Python SDK default) + limitStep: 30, // Escalation step (Python SDK default) + limitMax: 200, // Maximum limit (Python SDK default) + scrollAfterEscalation: true, + scrollMaxAttempts: 3, + scrollDirections: ['down', 'up'], + scrollViewportFraction: 0.4, + scrollStabilizeMs: 300, }, retry: { verifyTimeoutMs: 10000, @@ -147,16 +179,18 @@ export function getConfigPreset(preset: ConfigPreset | string): PlannerExecutorC case ConfigPreset.LOCAL_SMALL_MODEL as string: case 'local_small': // Optimized for local 4B-8B models (Ollama) - // - Tighter token limits work better with small models + // - Higher token limits for models like Qwen3 that include reasoning in output // - More lenient timeouts for slower local inference + // - Higher element limits to capture product links on e-commerce pages // - Verbose mode helpful for debugging local model behavior return { ...DEFAULT_CONFIG, snapshot: { - enabled: true, - limitBase: 60, - limitStep: 30, - limitMax: 200, + ...DEFAULT_CONFIG.snapshot, + // Higher limits needed for e-commerce - many elements filtered to interactive roles + limitBase: 200, // Capture more elements (was 60) + limitStep: 50, // Larger escalation steps (was 30) + limitMax: 400, // Higher max for complex pages (was 200) }, retry: { verifyTimeoutMs: 15000, @@ -165,8 +199,13 @@ export function getConfigPreset(preset: ConfigPreset | string): PlannerExecutorC executorRepairAttempts: 3, maxReplans: 2, }, - plannerMaxTokens: 1024, - executorMaxTokens: 64, + // No token limit for planner - let Qwen3 thinking models complete reasoning + // Small local models need room to think through the task step by step + plannerMaxTokens: 8192, + // Higher token limit to accommodate Qwen3/DeepSeek models that output reasoning + // before the actual action. Qwen3 models can use 4000+ chars of reasoning before + // outputting the actual action. Need enough headroom for the model to complete. + executorMaxTokens: 4096, verbose: true, }; @@ -250,10 +289,18 @@ export type DeepPartial = { * @returns Complete PlannerExecutorConfig */ export function mergeConfig(partial: DeepPartial): PlannerExecutorConfig { + const snapshot: SnapshotEscalationConfig = { + ...DEFAULT_CONFIG.snapshot, + ...(partial.snapshot ?? {}), + // Ensure scrollDirections has correct type + scrollDirections: (partial.snapshot?.scrollDirections ?? + DEFAULT_CONFIG.snapshot.scrollDirections) as Array<'up' | 'down'>, + }; + return { ...DEFAULT_CONFIG, ...partial, - snapshot: { ...DEFAULT_CONFIG.snapshot, ...(partial.snapshot ?? {}) }, + snapshot, retry: { ...DEFAULT_CONFIG.retry, ...(partial.retry ?? {}) }, stepwise: { ...DEFAULT_CONFIG.stepwise, ...(partial.stepwise ?? {}) }, }; diff --git a/src/agents/planner-executor/index.ts b/src/agents/planner-executor/index.ts index 573ac3a..861d2e1 100644 --- a/src/agents/planner-executor/index.ts +++ b/src/agents/planner-executor/index.ts @@ -5,8 +5,23 @@ * - Planner (7B+ model): Generates JSON execution plans * - Executor (3B-7B model): Executes steps with tight prompts * - * Note: The full PlannerExecutorAgent class is not yet ported to TypeScript. - * This module provides configuration and factory helpers for when it is. + * Phase 1 (MVP) Features: + * - Stepwise (ReAct-style) planning + * - Snapshot limit escalation for reliable element capture + * - Token usage tracking + * - Pre-action authorization hook (for sidecar policy integration) + * + * Phase 2 (Reliability) Features: + * - Scroll-after-escalation (viewport scrolling to find elements) + * - Intent heuristics (text pattern matching for common intents) + * - Pre-step verification (skip steps if predicates already pass) + * - Retry/repair logic (submit method alternation) + * + * Phase 3 (Advanced) Features: + * - Vision fallback detection (detectSnapshotFailure) + * - Recovery navigation (RecoveryState, RecoveryCheckpoint) + * - Boundary detection (auth pages, checkout pages) + * - Modal/overlay dismissal (findDismissalTarget) */ // Configuration @@ -31,3 +46,112 @@ export { resolveConfig, createPlannerExecutorAgentProviders, } from './agent-factory'; + +// Plan Models (Zod schemas and types) +export { + PredicateSpecSchema, + PlanStepSchema, + PlanSchema, + ReplanPatchSchema, + ActionType, + StepStatus, + type PredicateSpec, + type PlanStep, + type Plan, + type ReplanPatch, + type ActionRecord, + type StepOutcome, + type RunOutcome, + type TokenUsageTotals, + type TokenUsageSummary, + type SnapshotContext, + type ParsedAction, + type Snapshot, + type SnapshotElement, +} from './plan-models'; + +// Prompts +export { + buildStepwisePlannerPrompt, + buildExecutorPrompt, + type StepwisePlannerResponse, +} from './prompts'; + +// Utilities +export { + parseAction, + extractJson, + normalizePlan, + validatePlanSmoothness, + formatContext, +} from './plan-utils'; + +// Predicates +export { + type Predicate, + urlContains, + urlMatches, + exists, + notExists, + elementCount, + anyOf, + allOf, + buildPredicate, + evaluatePredicates, +} from './predicates'; + +// Vision Fallback +export { + type SnapshotDiagnostics, + type VisionFallbackResult, + detectSnapshotFailure, + shouldUseVision, +} from './vision-fallback'; + +// Recovery Navigation +export { + type RecoveryNavigationConfig, + type RecoveryCheckpoint, + RecoveryState, + DEFAULT_RECOVERY_CONFIG, +} from './recovery'; + +// Boundary Detection +export { + type AuthBoundaryConfig, + type CheckoutDetectionConfig, + type AuthBoundaryResult, + type CheckoutDetectionResult, + DEFAULT_AUTH_BOUNDARY_CONFIG, + DEFAULT_CHECKOUT_CONFIG, + detectAuthBoundary, + detectCheckoutPage, + isCheckoutElement, +} from './boundary-detection'; + +// Modal Dismissal +export { + type ModalDismissalConfig, + type ModalDismissalResult, + DEFAULT_MODAL_CONFIG, + findDismissalTarget, + detectModalAppearance, + detectModalDismissed, +} from './modal-dismissal'; + +// Agent +export { + PlannerExecutorAgent, + type PlannerExecutorAgentOptions, + type PreActionAuthorizer, + type AuthorizationResult, + type AgentRuntime, + type IntentHeuristics, +} from './planner-executor-agent'; + +// Runtime (Playwright/Chromium) +export { + PlaywrightRuntime, + createPlaywrightRuntime, + type PlaywrightRuntimeOptions, +} from './playwright-runtime'; diff --git a/src/agents/planner-executor/modal-dismissal.ts b/src/agents/planner-executor/modal-dismissal.ts new file mode 100644 index 0000000..db4b4b4 --- /dev/null +++ b/src/agents/planner-executor/modal-dismissal.ts @@ -0,0 +1,356 @@ +/** + * Modal/Overlay Dismissal Logic + * + * Handles automatic dismissal of blocking overlays after DOM changes: + * - Product protection/warranty upsells + * - Cookie consent banners + * - Newsletter signup popups + * - Promotional overlays + * - Cart upsell drawers + * + * Uses word boundary matching to avoid false positives. + */ + +import type { SnapshotElement } from './plan-models'; + +/** + * Configuration for modal dismissal. + */ +export interface ModalDismissalConfig { + /** Whether modal dismissal is enabled (default: true) */ + enabled: boolean; + /** Maximum dismissal attempts per modal (default: 2) */ + maxAttempts: number; + /** Minimum new elements to consider as modal appearance (default: 5) */ + minNewElements: number; + /** Element roles to consider for dismissal (default: ['button', 'link']) */ + roleFilter: string[]; + /** Dismissal text patterns (decline/skip, close, continue) */ + dismissPatterns: string[]; + /** Icon patterns for close buttons (exact match) */ + iconPatterns: string[]; + /** Checkout button patterns to skip dismissal when found */ + checkoutPatterns: string[]; +} + +/** + * Default modal dismissal configuration. + */ +export const DEFAULT_MODAL_CONFIG: ModalDismissalConfig = { + enabled: true, + maxAttempts: 2, + minNewElements: 5, + roleFilter: ['button', 'link'], + dismissPatterns: [ + // Decline/skip patterns (highest priority) + 'no thanks', + 'no, thanks', + 'not now', + 'skip', + 'decline', + 'maybe later', + 'not interested', + // Close patterns + 'close', + 'dismiss', + 'cancel', + 'x', + // Continue patterns (lower priority) + 'continue', + 'proceed', + 'ok', + 'got it', + 'i understand', + ], + iconPatterns: ['x', '×', '✕', '✖', '✗', '╳'], + checkoutPatterns: [ + 'checkout', + 'check out', + 'proceed to checkout', + 'go to checkout', + 'view cart', + 'view bag', + 'shopping cart', + 'shopping bag', + 'continue to checkout', + 'secure checkout', + 'go to cart', + 'see cart', + 'go to bag', + ], +}; + +/** + * Candidate element for modal dismissal. + */ +interface DismissCandidate { + /** Element ID */ + id: number; + /** Match score (higher = better) */ + score: number; + /** Pattern that matched */ + matchedPattern: string; +} + +/** + * Result of modal dismissal candidate search. + */ +export interface ModalDismissalResult { + /** Whether a dismissal target was found */ + found: boolean; + /** Element ID to click for dismissal */ + elementId: number | null; + /** The pattern that matched */ + matchedPattern: string | null; + /** Whether checkout button was detected (skip dismissal) */ + hasCheckoutButton: boolean; +} + +/** + * Check if text matches a pattern using word boundary matching. + * + * This avoids false positives like: + * - "mexico" matching "x" + * - "enclosed" matching "close" + * - "boxer" matching "x" + * + * @param text - Text to search in + * @param pattern - Pattern to match + * @returns true if pattern matches with word boundaries + */ +function wordBoundaryMatch(text: string, pattern: string): boolean { + const textLower = text.toLowerCase(); + const patternLower = pattern.toLowerCase(); + + // For single-character patterns like "x", require exact match + if (patternLower.length === 1) { + return textLower === patternLower; + } + + // For longer patterns, use word boundary regex + try { + const regex = new RegExp(`\\b${escapeRegex(patternLower)}\\b`, 'i'); + return regex.test(textLower); + } catch { + // Fallback to simple includes if regex fails + return textLower.includes(patternLower); + } +} + +/** + * Escape special regex characters. + */ +function escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +/** + * Check if element is a global navigation cart link (skip these). + * + * @param element - Element to check + * @returns true if element appears to be global nav + */ +function isGlobalNavCartLink(element: SnapshotElement): boolean { + const text = (element.text || '').toLowerCase(); + const ariaLabel = (element.ariaLabel || '').toLowerCase(); + const href = (element.href || '').toLowerCase(); + + // Skip if it's a small cart count indicator + if (/^\d+$/.test(text.trim())) { + return true; + } + + // Skip if it looks like main nav cart + if (text === 'cart' && href.includes('/cart')) { + // Check if there's additional context suggesting it's not an overlay button + return true; + } + + return false; +} + +/** + * Find the best element to dismiss a modal/overlay. + * + * Looks for buttons with common dismissal text patterns + * and returns the best candidate. + * + * CRITICAL: First checks if the overlay contains clickable checkout-related + * elements. If found, skips dismissal since the user should interact with those. + * + * @param elements - Elements from post-action snapshot + * @param config - Modal dismissal configuration + * @returns Modal dismissal result with element ID if found + * + * @example + * ```typescript + * const result = findDismissalTarget(elements, config); + * if (result.found && result.elementId !== null) { + * await runtime.click(result.elementId); + * } + * ``` + */ +export function findDismissalTarget( + elements: SnapshotElement[], + config: ModalDismissalConfig = DEFAULT_MODAL_CONFIG +): ModalDismissalResult { + if (!config.enabled) { + return { + found: false, + elementId: null, + matchedPattern: null, + hasCheckoutButton: false, + }; + } + + // CRITICAL: Check for clickable checkout buttons first + // Only skip if there's an actual button/link (not just text) + for (const element of elements) { + const role = (element.role || '').toLowerCase(); + + // Only consider buttons and links + if (!['button', 'link'].includes(role)) { + continue; + } + + // Skip global nav cart links + if (isGlobalNavCartLink(element)) { + continue; + } + + const text = (element.text || '').toLowerCase(); + const ariaLabel = (element.ariaLabel || '').toLowerCase(); + const href = (element.href || '').toLowerCase(); + + // Check text/aria-label for checkout patterns + for (const pattern of config.checkoutPatterns) { + if (text.includes(pattern) || ariaLabel.includes(pattern)) { + return { + found: false, + elementId: null, + matchedPattern: null, + hasCheckoutButton: true, + }; + } + } + + // Check href for cart/checkout links + if (href.includes('cart') || href.includes('checkout') || href.includes('bag')) { + return { + found: false, + elementId: null, + matchedPattern: null, + hasCheckoutButton: true, + }; + } + } + + // Find candidates that match dismissal patterns + const candidates: DismissCandidate[] = []; + + for (const element of elements) { + const id = element.id; + if (id === undefined) continue; + + const role = (element.role || '').toLowerCase(); + + // Only consider specified roles + if (!config.roleFilter.includes(role)) { + continue; + } + + const text = (element.text || '').toLowerCase().trim(); + const ariaLabel = (element.ariaLabel || '').toLowerCase(); + + // Check for icon patterns (exact match, highest priority) + for (const icon of config.iconPatterns) { + if (text === icon || ariaLabel === icon) { + candidates.push({ + id, + score: 200, // Highest priority + matchedPattern: icon, + }); + break; + } + } + + // Check for dismissal patterns (word boundary match) + for (let i = 0; i < config.dismissPatterns.length; i++) { + const pattern = config.dismissPatterns[i]; + + if (wordBoundaryMatch(text, pattern) || wordBoundaryMatch(ariaLabel, pattern)) { + // Score: earlier patterns have higher priority + candidates.push({ + id, + score: 100 - i, + matchedPattern: pattern, + }); + break; + } + } + } + + // Return best candidate (highest score) + if (candidates.length === 0) { + return { + found: false, + elementId: null, + matchedPattern: null, + hasCheckoutButton: false, + }; + } + + candidates.sort((a, b) => b.score - a.score); + const best = candidates[0]; + + return { + found: true, + elementId: best.id, + matchedPattern: best.matchedPattern, + hasCheckoutButton: false, + }; +} + +/** + * Detect if significant DOM change occurred (potential modal). + * + * @param preElements - Element IDs before action + * @param postElements - Element IDs after action + * @param minNewElements - Minimum new elements to consider as modal + * @returns true if modal-like change detected + */ +export function detectModalAppearance( + preElements: Set, + postElements: Set, + minNewElements: number = DEFAULT_MODAL_CONFIG.minNewElements +): boolean { + const newElements = new Set(); + for (const id of postElements) { + if (!preElements.has(id)) { + newElements.add(id); + } + } + return newElements.size >= minNewElements; +} + +/** + * Detect if modal was successfully dismissed. + * + * @param preElements - Element IDs before dismissal attempt + * @param postElements - Element IDs after dismissal attempt + * @param minRemovedElements - Minimum removed elements to consider dismissed (default: 3) + * @returns true if significant elements were removed (modal dismissed) + */ +export function detectModalDismissed( + preElements: Set, + postElements: Set, + minRemovedElements: number = 3 +): boolean { + const removedElements = new Set(); + for (const id of preElements) { + if (!postElements.has(id)) { + removedElements.add(id); + } + } + return removedElements.size >= minRemovedElements; +} diff --git a/src/agents/planner-executor/plan-models.ts b/src/agents/planner-executor/plan-models.ts new file mode 100644 index 0000000..10e5b95 --- /dev/null +++ b/src/agents/planner-executor/plan-models.ts @@ -0,0 +1,308 @@ +/** + * Plan Models for PlannerExecutorAgent + * + * Zod schemas and types for execution plans, steps, and verification predicates. + * Ported from Python SDK's Pydantic models. + */ + +import { z } from 'zod'; + +// --------------------------------------------------------------------------- +// Predicate Specification +// --------------------------------------------------------------------------- + +/** + * Schema for verification predicate specification. + * + * Predicates are used to verify step outcomes: + * - url_contains: Check if URL contains a substring + * - url_matches: Check if URL matches a pattern + * - exists: Check if element exists + * - not_exists: Check if element does not exist + * - element_count: Check element count within range + * - any_of: Any predicate passes + * - all_of: All predicates pass + */ +export const PredicateSpecSchema = z.object({ + predicate: z + .string() + .describe('Predicate type: url_contains, exists, not_exists, any_of, all_of, element_count'), + args: z.array(z.any()).default([]).describe('Predicate arguments'), +}); + +export type PredicateSpec = z.infer; + +// --------------------------------------------------------------------------- +// Plan Step +// --------------------------------------------------------------------------- + +/** + * Action types supported by the executor. + */ +export const ActionType = z.enum([ + 'NAVIGATE', + 'CLICK', + 'TYPE', + 'TYPE_AND_SUBMIT', + 'SCROLL', + 'PRESS', + 'WAIT', + 'EXTRACT', + 'DONE', +]); + +export type ActionType = z.infer; + +/** + * Type for a plan step. + */ +export interface PlanStep { + id: number; + goal: string; + action: string; + target?: string; + intent?: string; + input?: string; + verify: PredicateSpec[]; + required: boolean; + stopIfTrue: boolean; + optionalSubsteps: PlanStep[]; + heuristicHints: Record[]; +} + +/** + * Schema for a single step in the execution plan. + * Note: For simplicity, optionalSubsteps validation is shallow (z.any()). + * Full recursive validation happens at runtime if needed. + */ +export const PlanStepSchema = z.object({ + id: z.number().describe('Step ID (1-indexed, contiguous)'), + goal: z.string().describe('Human-readable goal for this step'), + action: z + .string() + .describe('Action type: NAVIGATE, CLICK, TYPE_AND_SUBMIT, SCROLL, EXTRACT, DONE'), + target: z.string().optional().describe('URL for NAVIGATE action'), + intent: z.string().optional().describe('Intent hint for CLICK action'), + input: z.string().optional().describe('Text for TYPE_AND_SUBMIT action'), + verify: z.array(PredicateSpecSchema).default([]).describe('Verification predicates'), + required: z.boolean().default(true).describe('If True, step failure triggers replan'), + stopIfTrue: z + .boolean() + .default(false) + .describe('If True, stop execution when verification passes'), + optionalSubsteps: z.array(z.any()).default([]).describe('Optional fallback steps'), + heuristicHints: z + .array(z.record(z.any())) + .default([]) + .describe('Planner-generated hints for element selection'), +}); + +// --------------------------------------------------------------------------- +// Plan +// --------------------------------------------------------------------------- + +/** + * Schema for execution plan generated by the Planner. + */ +export const PlanSchema = z.object({ + task: z.string().describe('Original task description'), + notes: z.array(z.string()).default([]).describe('Planner notes/assumptions'), + steps: z.array(PlanStepSchema).describe('Ordered execution steps'), +}); + +export type Plan = z.infer; + +// --------------------------------------------------------------------------- +// Replan Patch +// --------------------------------------------------------------------------- + +/** + * Schema for plan patch (used when replanning after step failure). + */ +export const ReplanPatchSchema = z.object({ + mode: z.literal('patch'), + replaceSteps: z + .array( + z.object({ + id: z.number(), + step: PlanStepSchema, + }) + ) + .describe('Steps to replace by ID'), +}); + +export type ReplanPatch = z.infer; + +// --------------------------------------------------------------------------- +// Action Record (for stepwise planning history) +// --------------------------------------------------------------------------- + +/** + * Record of an executed action for history tracking in stepwise planning. + */ +export interface ActionRecord { + /** Step number (1-indexed) */ + stepNum: number; + /** Action type (CLICK, TYPE_AND_SUBMIT, SCROLL, etc.) */ + action: string; + /** Element description or URL */ + target: string | null; + /** Outcome (success, failed) */ + result: string; + /** URL after action completed */ + urlAfter: string | null; +} + +// --------------------------------------------------------------------------- +// Step Outcome +// --------------------------------------------------------------------------- + +/** + * Status of a step execution. + */ +export enum StepStatus { + SUCCESS = 'success', + FAILED = 'failed', + SKIPPED = 'skipped', + VISION_FALLBACK = 'vision_fallback', +} + +/** + * Result of executing a single plan step. + */ +export interface StepOutcome { + stepId: number; + goal: string; + status: StepStatus; + actionTaken?: string; + verificationPassed: boolean; + usedVision: boolean; + error?: string; + durationMs: number; + urlBefore?: string; + urlAfter?: string; + extractedData?: unknown; +} + +// --------------------------------------------------------------------------- +// Run Outcome +// --------------------------------------------------------------------------- + +/** + * Result of a complete agent run. + */ +export interface RunOutcome { + runId: string; + task: string; + success: boolean; + stepsCompleted: number; + stepsTotal: number; + replansUsed: number; + stepOutcomes: StepOutcome[]; + totalDurationMs: number; + error?: string; + tokenUsage?: TokenUsageSummary; + fallbackUsed: boolean; +} + +// --------------------------------------------------------------------------- +// Token Usage Tracking +// --------------------------------------------------------------------------- + +/** + * Token usage totals for a single role or model. + */ +export interface TokenUsageTotals { + calls: number; + promptTokens: number; + completionTokens: number; + totalTokens: number; +} + +/** + * Summary of all token usage. + */ +export interface TokenUsageSummary { + total: TokenUsageTotals; + byRole: Record; + byModel: Record; +} + +// --------------------------------------------------------------------------- +// Snapshot Types +// --------------------------------------------------------------------------- + +/** + * Snapshot element for context formatting. + */ +export interface SnapshotElement { + id: number; + role?: string; + text?: string; + name?: string; + importance?: number; + isPrimary?: boolean; + background?: boolean; + clickable?: boolean; + nearbyText?: string; + ordinal?: string; + inDominantGroup?: boolean; + href?: string; + ariaLabel?: string; +} + +/** + * Snapshot data from browser runtime. + */ +export interface Snapshot { + url: string; + title: string; + elements: SnapshotElement[]; + screenshot?: string; + status?: string; +} + +// --------------------------------------------------------------------------- +// Snapshot Context +// --------------------------------------------------------------------------- + +/** + * Shared page state between Planner and Executor. + * + * Enables snapshot sharing to avoid redundant captures and + * tracks metadata for vision fallback decisions. + */ +export interface SnapshotContext { + /** The snapshot data */ + snapshot: Snapshot | null; + /** Compact representation for LLM context */ + compactRepresentation: string; + /** Base64-encoded screenshot */ + screenshotBase64: string | null; + /** When the snapshot was captured */ + capturedAt: Date; + /** Element limit used for this snapshot */ + limitUsed: number; + /** Whether snapshot was successful */ + snapshotSuccess: boolean; + /** Whether vision fallback is required */ + requiresVision: boolean; + /** Reason for vision fallback */ + visionReason: string | null; + /** Pruning category used (if any) */ + pruningCategory: string | null; + /** Number of elements after pruning */ + prunedNodeCount: number; +} + +// --------------------------------------------------------------------------- +// Parsed Action +// --------------------------------------------------------------------------- + +/** + * Parsed action from executor response. + */ +export interface ParsedAction { + action: string; + args: unknown[]; +} diff --git a/src/agents/planner-executor/plan-utils.ts b/src/agents/planner-executor/plan-utils.ts new file mode 100644 index 0000000..9867954 --- /dev/null +++ b/src/agents/planner-executor/plan-utils.ts @@ -0,0 +1,648 @@ +/** + * Plan Utilities for PlannerExecutorAgent + * + * Action parsing, plan normalization, and validation utilities. + */ + +import type { ParsedAction, PredicateSpec, Plan, SnapshotElement } from './plan-models'; + +// --------------------------------------------------------------------------- +// Action Parsing +// --------------------------------------------------------------------------- + +/** + * Parse action from executor response. + * + * Handles various LLM output formats: + * - CLICK(42) + * - - CLICK(42) (with leading dash/bullet) + * - TYPE(42, "text") + * - - TYPE(42, "Logitech mouse") + * - SCROLL(down) + * - PRESS('Enter') + * - NONE (executor couldn't find element) + * + * @param text - Raw executor response + * @returns Parsed action with type and arguments + */ +export function parseAction(text: string): ParsedAction { + let cleaned = text.trim(); + + // Strip ... tags (Qwen/DeepSeek reasoning output) + cleaned = cleaned.replace(/[\s\S]*?<\/think>/gi, '').trim(); + // If never closed, strip from first to end + cleaned = cleaned.replace(/[\s\S]*$/gi, '').trim(); + + // If after stripping think tags we have empty content, return NONE + // This happens when the model only outputs thinking without an actual action + if (!cleaned || cleaned.length === 0) { + return { action: 'NONE', args: ['empty response after stripping think tags'] }; + } + + // Strip common prefixes (bullets, dashes, asterisks) + cleaned = cleaned.replace(/^[-*•]\s*/, ''); + + // CLICK() + const clickMatch = cleaned.match(/CLICK\((\d+)\)/); + if (clickMatch) { + return { action: 'CLICK', args: [parseInt(clickMatch[1], 10)] }; + } + + // TYPE(, "text") - also handle without quotes + const typeMatch = cleaned.match(/TYPE\((\d+),\s*["']?([^"']+?)["']?\)/); + if (typeMatch) { + return { action: 'TYPE', args: [parseInt(typeMatch[1], 10), typeMatch[2].trim()] }; + } + + // PRESS('key') + const pressMatch = cleaned.match(/PRESS\(['"]?(.+?)['"]?\)/); + if (pressMatch) { + return { action: 'PRESS', args: [pressMatch[1]] }; + } + + // SCROLL(direction) + const scrollMatch = cleaned.match(/SCROLL\((\w+)\)/); + if (scrollMatch) { + return { action: 'SCROLL', args: [scrollMatch[1]] }; + } + + // FINISH() + if (cleaned.includes('FINISH')) { + return { action: 'FINISH', args: [] }; + } + + // DONE + if (cleaned.toUpperCase().includes('DONE')) { + return { action: 'DONE', args: [] }; + } + + // NONE - executor couldn't find a suitable element + if (cleaned.toUpperCase() === 'NONE' || cleaned.toUpperCase().includes('NONE')) { + return { action: 'NONE', args: [] }; + } + + return { action: 'UNKNOWN', args: [text] }; +} + +// --------------------------------------------------------------------------- +// JSON Extraction +// --------------------------------------------------------------------------- + +/** + * Strip thinking tags from LLM response (Qwen, DeepSeek, etc.) + */ +function stripThinkingTags(content: string): string { + let cleaned = content; + // Strip complete ... tags + cleaned = cleaned.replace(/[\s\S]*?<\/think>/gi, '').trim(); + // If never closed, strip from first to end + cleaned = cleaned.replace(/[\s\S]*$/gi, '').trim(); + return cleaned; +} + +/** + * Extract JSON from LLM response that may contain markdown or prose. + * + * Handles: + * - Pure JSON responses + * - JSON wrapped in ```json code blocks + * - JSON embedded in prose text + * - Qwen/DeepSeek ... tags + * + * @param content - Raw LLM response + * @returns Parsed JSON object + * @throws Error if no valid JSON found + */ +export function extractJson(content: string): Record { + // Strip thinking tags first (Qwen, DeepSeek models) + const cleaned = stripThinkingTags(content); + + // Try direct parse first + try { + return JSON.parse(cleaned); + } catch { + // Continue to extraction methods + } + + // Try to extract from code block + const codeBlockMatch = cleaned.match(/```(?:json)?\s*([\s\S]*?)```/); + if (codeBlockMatch) { + try { + return JSON.parse(codeBlockMatch[1].trim()); + } catch { + // Continue to other methods + } + } + + // Try to find JSON object in text + const jsonMatch = cleaned.match(/\{[\s\S]*\}/); + if (jsonMatch) { + try { + return JSON.parse(jsonMatch[0]); + } catch { + // Continue to last resort + } + } + + throw new Error(`Failed to extract JSON from response: ${cleaned.slice(0, 200)}`); +} + +// --------------------------------------------------------------------------- +// Plan Normalization +// --------------------------------------------------------------------------- + +/** + * Action name aliases to normalize LLM output variations. + */ +const ACTION_ALIASES: Record = { + CLICK_ELEMENT: 'CLICK', + CLICK_BUTTON: 'CLICK', + CLICK_LINK: 'CLICK', + INPUT: 'TYPE_AND_SUBMIT', + TYPE_TEXT: 'TYPE_AND_SUBMIT', + ENTER_TEXT: 'TYPE_AND_SUBMIT', + EXTRACT_TEXT: 'EXTRACT', + GOTO: 'NAVIGATE', + GO_TO: 'NAVIGATE', + OPEN: 'NAVIGATE', + SCROLL_DOWN: 'SCROLL', + SCROLL_UP: 'SCROLL', +}; + +/** + * Parse a string predicate into a normalized object. + * + * LLMs sometimes output predicates as strings like: + * - "url_contains('amazon.com')" -> {predicate: "url_contains", args: ["amazon.com"]} + * - "exists(role=button)" -> {predicate: "exists", args: ["role=button"]} + */ +function parseStringPredicate(predStr: string): Record | null { + const cleaned = predStr.trim(); + + // Try to match function-call style: predicate_name(args) + // Use [\s\S] instead of . with 's' flag for cross-browser compatibility + const match = cleaned.match(/^(\w+)\s*\(\s*([\s\S]+?)\s*\)$/); + if (match) { + const predName = match[1]; + let argsStr = match[2].trim(); + + // Strip quotes from args if present + if ( + (argsStr.startsWith("'") && argsStr.endsWith("'")) || + (argsStr.startsWith('"') && argsStr.endsWith('"')) + ) { + argsStr = argsStr.slice(1, -1); + } + + return { + predicate: predName, + args: [argsStr], + }; + } + + // Try simple predicate name without args + if (/^[\w_]+$/.test(cleaned)) { + return { + predicate: cleaned, + args: [], + }; + } + + return null; +} + +/** + * Normalize a verify predicate to the expected format. + * + * LLMs may output predicates in various formats: + * - {"url_contains": "amazon.com"} -> {"predicate": "url_contains", "args": ["amazon.com"]} + * - {"predicate": "url_contains", "input": "x"} -> {"predicate": "url_contains", "args": ["x"]} + */ +function normalizeVerifyPredicate(pred: Record): Record { + const result = { ...pred }; + + // Handle "type" field as alternative to "predicate" + if ('type' in result && !('predicate' in result)) { + result.predicate = result.type; + delete result.type; + } + + // Already has predicate field - normalize args + if ('predicate' in result) { + if (!result.args || (Array.isArray(result.args) && result.args.length === 0)) { + if ('input' in result) { + result.args = [result.input]; + delete result.input; + } else if ('value' in result) { + result.args = [result.value]; + delete result.value; + } else if ('pattern' in result) { + result.args = [result.pattern]; + delete result.pattern; + } else if ('substring' in result) { + result.args = [result.substring]; + delete result.substring; + } else if ('selector' in result) { + result.args = [result.selector]; + delete result.selector; + } + } + return result; + } + + // Predicate type is a key in the dict (e.g., {"url_contains": "amazon.com"}) + const knownPredicates = [ + 'url_contains', + 'url_equals', + 'url_matches', + 'exists', + 'not_exists', + 'element_count', + 'element_visible', + 'any_of', + 'all_of', + 'text_contains', + 'text_equals', + ]; + + for (const predType of knownPredicates) { + if (predType in result) { + return { + predicate: predType, + args: result[predType] ? [result[predType]] : [], + }; + } + } + + // Unknown format - return as-is + return result; +} + +/** + * Normalize plan dictionary to handle LLM output variations. + * + * Handles: + * - url vs target field names + * - action aliases (click vs CLICK) + * - step id variations (string vs int) + * - verify predicate format variations + * + * @param planDict - Raw plan dictionary from LLM + * @returns Normalized plan dictionary + */ +export function normalizePlan(planDict: Record): Record { + const result = { ...planDict }; + + if ('steps' in result && Array.isArray(result.steps)) { + result.steps = result.steps.map((step: Record) => { + const normalizedStep = { ...step }; + + // Normalize action names to uppercase + if ('action' in normalizedStep && typeof normalizedStep.action === 'string') { + const action = normalizedStep.action.toUpperCase(); + normalizedStep.action = ACTION_ALIASES[action] || action; + } + + // Normalize url -> target for NAVIGATE actions + if ('url' in normalizedStep && !('target' in normalizedStep)) { + normalizedStep.target = normalizedStep.url; + delete normalizedStep.url; + } + + // Ensure step id is number + if ('id' in normalizedStep && typeof normalizedStep.id === 'string') { + const parsed = parseInt(normalizedStep.id, 10); + if (!isNaN(parsed)) { + normalizedStep.id = parsed; + } + } + + // Normalize verify predicates + if ('verify' in normalizedStep && Array.isArray(normalizedStep.verify)) { + normalizedStep.verify = normalizedStep.verify.map((pred: unknown) => { + if (typeof pred === 'object' && pred !== null) { + return normalizeVerifyPredicate(pred as Record); + } else if (typeof pred === 'string') { + const parsed = parseStringPredicate(pred); + if (parsed) { + return parsed; + } + return { predicate: 'unknown', args: [pred] }; + } + return pred; + }); + } + + // Normalize optional_substeps recursively + if ( + 'optional_substeps' in normalizedStep && + Array.isArray(normalizedStep.optional_substeps) + ) { + normalizedStep.optionalSubsteps = normalizedStep.optional_substeps.map( + (substep: Record) => { + const normalizedSubstep = { ...substep }; + if ('action' in normalizedSubstep && typeof normalizedSubstep.action === 'string') { + normalizedSubstep.action = normalizedSubstep.action.toUpperCase(); + } + if ('url' in normalizedSubstep && !('target' in normalizedSubstep)) { + normalizedSubstep.target = normalizedSubstep.url; + delete normalizedSubstep.url; + } + return normalizedSubstep; + } + ); + delete normalizedStep.optional_substeps; + } + + // Convert snake_case to camelCase for common fields + if ('stop_if_true' in normalizedStep) { + normalizedStep.stopIfTrue = normalizedStep.stop_if_true; + delete normalizedStep.stop_if_true; + } + if ('heuristic_hints' in normalizedStep) { + normalizedStep.heuristicHints = normalizedStep.heuristic_hints; + delete normalizedStep.heuristic_hints; + } + + return normalizedStep; + }); + } + + return result; +} + +// --------------------------------------------------------------------------- +// Plan Validation +// --------------------------------------------------------------------------- + +/** + * Validate plan quality and smoothness. + * + * Checks for common issues that indicate a low-quality plan: + * - Missing verification predicates + * - Consecutive same actions + * - Empty or too short plans + * - Missing required fields + * + * @param plan - Parsed Plan object + * @returns List of warning strings (empty if plan is smooth) + */ +export function validatePlanSmoothness(plan: Plan): string[] { + const warnings: string[] = []; + + // Check for empty plan + if (!plan.steps || plan.steps.length === 0) { + warnings.push('Plan has no steps'); + return warnings; + } + + // Check for very short plans (might be incomplete) + if (plan.steps.length < 2) { + warnings.push('Plan has only one step - might be incomplete'); + } + + // Check each step + let prevAction: string | null = null; + for (const step of plan.steps) { + // Check for missing verification + if ((!step.verify || step.verify.length === 0) && step.required !== false) { + warnings.push(`Step ${step.id} has no verification predicates`); + } + + // Check for consecutive same actions (might indicate loop) + if (step.action === prevAction && step.action === 'CLICK') { + warnings.push(`Steps ${step.id - 1} and ${step.id} both use ${step.action}`); + } + + // Check for NAVIGATE without target + if (step.action === 'NAVIGATE' && !step.target) { + warnings.push(`Step ${step.id} is NAVIGATE but has no target URL`); + } + + // Check for CLICK without intent + if (step.action === 'CLICK' && !step.intent) { + warnings.push(`Step ${step.id} is CLICK but has no intent hint`); + } + + // Check for TYPE_AND_SUBMIT without input + if (step.action === 'TYPE_AND_SUBMIT' && !step.input) { + warnings.push(`Step ${step.id} is TYPE_AND_SUBMIT but has no input`); + } + + prevAction = step.action; + } + + return warnings; +} + +// --------------------------------------------------------------------------- +// Context Formatting +// --------------------------------------------------------------------------- + +/** + * Format snapshot elements for LLM context. + * + * Uses compact format: id|role|text|importance|is_primary|bg|clickable|nearby_text|ord|DG|href + * + * Uses multi-strategy selection (like Python SDK) to ensure diverse element coverage: + * 1. Top elements by importance (captures high-priority navigation) + * 2. Elements from dominant group (captures product listings) + * 3. Top elements by position (captures visible content regardless of importance) + * + * @param elements - Array of snapshot elements + * @param limit - Maximum number of elements to include + * @returns Compact string representation + */ +export function formatContext(elements: SnapshotElement[], limit: number = 200): string { + // Filter to interactive elements + const interactiveRoles = new Set([ + 'button', + 'link', + 'textbox', + 'searchbox', + 'combobox', + 'checkbox', + 'radio', + 'slider', + 'tab', + 'menuitem', + 'option', + 'switch', + 'cell', + 'a', + 'input', + 'select', + 'textarea', + ]); + + // Roles that should be prioritized (input elements for typing) + const inputRoles = new Set(['textbox', 'searchbox', 'combobox', 'input', 'textarea']); + + // Include elements that are: + // 1. Have an interactive role (button, link, textbox, etc.) + // 2. Are marked as clickable + // 3. Have an href (links, even if role is not 'link') + const filtered = elements.filter(el => { + const role = (el.role || '').toLowerCase(); + const isInteractive = interactiveRoles.has(role); + const hasHref = Boolean(el.href); + return isInteractive || el.clickable || hasHref; + }); + + // Debug: Log filtering stats + const linkCount = elements.filter(el => (el.role || '').toLowerCase() === 'link').length; + const buttonCount = elements.filter(el => (el.role || '').toLowerCase() === 'button').length; + const hrefCount = elements.filter(el => Boolean(el.href)).length; + const clickableCount = elements.filter(el => el.clickable).length; + console.log( + ` [formatContext] Total: ${elements.length}, Links: ${linkCount}, Buttons: ${buttonCount}, Href: ${hrefCount}, Clickable: ${clickableCount}, After filter: ${filtered.length}` + ); + + // === Multi-strategy selection (like Python SDK) === + const selectedIds = new Set(); + const selected: SnapshotElement[] = []; + + // Helper to add element if not already selected + const addElement = (el: SnapshotElement): boolean => { + if (el.id === undefined || selectedIds.has(el.id)) return false; + selectedIds.add(el.id); + selected.push(el); + return true; + }; + + // 1. First, add all input elements (searchbox, textbox, etc.) - these are critical + for (const el of filtered) { + const role = (el.role || '').toLowerCase(); + if (inputRoles.has(role)) { + addElement(el); + } + } + + // 2. Top 40 by importance (like Python SDK) + const byImportance = [...filtered].sort((a, b) => (b.importance || 0) - (a.importance || 0)); + for (const el of byImportance.slice(0, 40)) { + addElement(el); + } + + // 3. Elements from dominant group (product listings typically have inDominantGroup=true) + const dominantGroup = filtered.filter(el => el.inDominantGroup); + for (const el of dominantGroup) { + if (selected.length >= 80) break; // Cap at ~80 from dominant group + addElement(el); + } + + // 4. Top 30 by position (elements appearing earlier in document, like Python SDK) + // This ensures we capture visible products even if they have low importance + // Sort by element ID as proxy for document order (lower ID = earlier in DOM) + const byPosition = [...filtered].sort((a, b) => { + // Use element ID as proxy for document order + return (a.id || 0) - (b.id || 0); + }); + for (const el of byPosition.slice(0, 30)) { + addElement(el); + } + + // 5. Product-like links: links with longer text and product URLs + // This captures product cards that might have low importance but are clickable + const productLinks = filtered.filter(el => { + const role = (el.role || '').toLowerCase(); + const text = (el.text || '').trim(); + const href = (el.href || '').toLowerCase(); + // Links with substantial text (>15 chars) or product URL patterns + return ( + role === 'link' && + (text.length > 15 || + href.includes('/dp/') || + href.includes('/product/') || + href.includes('/item/') || + href.includes('/p/')) + ); + }); + for (const el of productLinks.slice(0, 20)) { + addElement(el); + } + + // 6. Fill remaining slots with any remaining elements up to limit + for (const el of filtered) { + if (selected.length >= limit) break; + addElement(el); + } + + // Format each element + // Format: id|role|text|importance|is_primary|bg|clickable|nearby_text|ord|DG|href + // (matches Python SDK format exactly) + const lines: string[] = []; + + // Add header row + lines.push('id|role|text|importance|is_primary|bg|clickable|nearby_text|ord|DG|href'); + for (const el of selected) { + // If element has href, treat as link (like Python SDK) + const role = el.href ? 'link' : el.role || ''; + + // Truncate text to 30 chars (like Python SDK) + const text = truncateText(el.text || el.name || '', 30); + + // Build line in Python SDK order + const parts = [ + el.id, + role, + text, + el.importance || 0, + el.isPrimary ? '1' : '0', + '', // bg (background color name) + el.clickable ? '1' : '0', + truncateText(el.nearbyText || '', 20), + el.ordinal || '', + el.inDominantGroup ? '1' : '0', + compressHref(el.href || ''), + ]; + lines.push(parts.join('|')); + } + + return lines.join('\n'); +} + +/** + * Compress href to last path segment (like Python SDK). + */ +function compressHref(href: string): string { + if (!href) return ''; + href = href.trim(); + + // Relative URL - get last segment + if (href.startsWith('/')) { + const parts = href.split('/'); + const last = parts[parts.length - 1] || ''; + return last.slice(0, 20); + } + + // Absolute URL - try to parse + try { + const url = new URL(href); + if (url.pathname && url.pathname !== '/') { + const parts = url.pathname.replace(/\/$/, '').split('/'); + const last = parts[parts.length - 1] || ''; + return last.slice(0, 20) || url.hostname.slice(0, 15); + } + return url.hostname.slice(0, 15); + } catch { + return href.slice(0, 20); + } +} + +/** + * Truncate and sanitize text for LLM context. + * Removes newlines and excessive whitespace to keep pipe-delimited format intact. + */ +function truncateText(text: string, maxLen: number): string { + // Replace newlines and multiple spaces with single space + const sanitized = text + .replace(/[\r\n]+/g, ' ') + .replace(/\s+/g, ' ') + .trim(); + if (sanitized.length <= maxLen) return sanitized; + return sanitized.slice(0, maxLen - 3) + '...'; +} + +// Re-export SnapshotElement for backwards compatibility +export type { SnapshotElement } from './plan-models'; diff --git a/src/agents/planner-executor/planner-executor-agent.ts b/src/agents/planner-executor/planner-executor-agent.ts new file mode 100644 index 0000000..bceb91e --- /dev/null +++ b/src/agents/planner-executor/planner-executor-agent.ts @@ -0,0 +1,1415 @@ +/** + * PlannerExecutorAgent: Two-tier agent architecture for browser automation. + * + * MVP implementation with: + * - Stepwise (ReAct-style) planning + * - Compact context formatting for small models + * - Snapshot escalation for reliable element capture + * - Pre-step verification + * - Retry/repair logic + * - Token usage tracking + * - Pre-action authorization hook (for sidecar policy integration) + * + * Deferred to post-MVP: + * - Vision fallback + * - Modal/overlay dismissal + * - Captcha handling + * - Intent heuristics + * - Recovery navigation + */ + +import type { LLMProvider, LLMResponse } from '../../llm-provider'; +import type { + PlannerExecutorConfig, + SnapshotEscalationConfig, + RetryConfig, + StepwisePlanningConfig, +} from './config'; +import { DEFAULT_CONFIG, mergeConfig, type DeepPartial } from './config'; +import type { + Plan, + PlanStep, + ActionRecord, + StepOutcome, + RunOutcome, + TokenUsageSummary, + TokenUsageTotals, + SnapshotContext, + ParsedAction, + Snapshot, + SnapshotElement, +} from './plan-models'; +import { StepStatus, PlanSchema } from './plan-models'; +import { + buildStepwisePlannerPrompt, + buildExecutorPrompt, + type StepwisePlannerResponse, +} from './prompts'; +import { parseAction, extractJson, normalizePlan, formatContext } from './plan-utils'; + +// --------------------------------------------------------------------------- +// Token Usage Collector +// --------------------------------------------------------------------------- + +/** + * Collects token usage statistics by role (planner/executor) and model. + */ +class TokenUsageCollector { + private byRole: Map = new Map(); + private byModel: Map = new Map(); + + record(role: string, resp: LLMResponse): void { + // By role + const roleTotals = this.byRole.get(role) || { + calls: 0, + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + }; + roleTotals.calls += 1; + const pt = typeof resp.promptTokens === 'number' ? Math.max(0, resp.promptTokens) : 0; + const ct = typeof resp.completionTokens === 'number' ? Math.max(0, resp.completionTokens) : 0; + const tt = typeof resp.totalTokens === 'number' ? Math.max(0, resp.totalTokens) : pt + ct; + roleTotals.promptTokens += pt; + roleTotals.completionTokens += ct; + roleTotals.totalTokens += tt; + this.byRole.set(role, roleTotals); + + // By model + const modelName = (resp.modelName || '').trim() || 'unknown'; + const modelTotals = this.byModel.get(modelName) || { + calls: 0, + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + }; + modelTotals.calls += 1; + modelTotals.promptTokens += pt; + modelTotals.completionTokens += ct; + modelTotals.totalTokens += tt; + this.byModel.set(modelName, modelTotals); + } + + reset(): void { + this.byRole.clear(); + this.byModel.clear(); + } + + summary(): TokenUsageSummary { + // Sum totals + const total: TokenUsageTotals = { + calls: 0, + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + }; + for (const t of this.byRole.values()) { + total.calls += t.calls; + total.promptTokens += t.promptTokens; + total.completionTokens += t.completionTokens; + total.totalTokens += t.totalTokens; + } + + // Convert maps to records + const byRole: Record = {}; + for (const [k, v] of this.byRole) { + byRole[k] = { ...v }; + } + const byModel: Record = {}; + for (const [k, v] of this.byModel) { + byModel[k] = { ...v }; + } + + return { total, byRole, byModel }; + } +} + +// --------------------------------------------------------------------------- +// Pre-Action Authorizer Interface +// --------------------------------------------------------------------------- + +/** + * Authorization result from pre-action check. + */ +export interface AuthorizationResult { + /** Whether the action is allowed */ + allowed: boolean; + /** Reason for denial (if not allowed) */ + reason?: string; + /** Alternative action to take (if any) */ + alternative?: string; +} + +/** + * Hook for pre-action authorization (e.g., sidecar policy evaluation). + * + * Called before each action is executed. If the action is denied, + * the agent will skip the action and record the denial. + * + * @example + * ```typescript + * const authorizer: PreActionAuthorizer = async (action, context) => { + * const response = await fetch('http://localhost:3500/v1/authorize', { + * method: 'POST', + * body: JSON.stringify({ + * principal: 'agent:browser-automation', + * action: `browser.${action.type.toLowerCase()}`, + * resource: context.url, + * }), + * }); + * const result = await response.json(); + * return { allowed: result.decision === 'ALLOW', reason: result.reason }; + * }; + * ``` + */ +export type PreActionAuthorizer = ( + action: { type: string; elementId?: number; value?: string }, + context: { url: string; stepGoal: string; taskGoal: string } +) => Promise; + +// --------------------------------------------------------------------------- +// Intent Heuristics Interface +// --------------------------------------------------------------------------- + +/** + * Interface for intent-based element selection. + * + * Allows bypassing LLM for known patterns (e.g., "add to cart" buttons). + * Can be implemented by domain-specific heuristics. + */ +export interface IntentHeuristics { + /** + * Find element matching the given intent. + * + * @param intent - Intent string (e.g., "add_to_cart", "search") + * @param elements - Available elements from snapshot + * @param url - Current page URL + * @param goal - Step goal description + * @returns Element ID if found, null otherwise + */ + findElementForIntent( + intent: string, + elements: SnapshotElement[], + url: string, + goal: string + ): number | null; + + /** + * Get priority order for intent matching. + * Higher priority intents are tried first. + */ + priorityOrder(): string[]; +} + +/** + * Common text patterns for intent matching. + * Used as fallback when no custom heuristics are provided. + */ +const COMMON_INTENT_PATTERNS: Record = { + add_to_cart: ['add to cart', 'add to bag', 'add to basket', 'buy now', 'add item'], + checkout: ['checkout', 'proceed to checkout', 'go to checkout', 'check out'], + search: ['search', 'find', 'go', 'submit'], + login: ['log in', 'login', 'sign in', 'signin'], + submit: ['submit', 'send', 'continue', 'next', 'confirm'], + close: ['close', 'dismiss', 'x', 'cancel', 'no thanks'], +}; + +/** + * Action verbs to strip from descriptive intents. + * E.g., "click the Add to Cart button" → "add to cart button" + */ +const ACTION_VERBS = [ + 'click', + 'tap', + 'press', + 'select', + 'choose', + 'pick', + 'find', + 'locate', + 'look for', + 'search for', + 'type', + 'enter', + 'input', + 'fill', + 'scroll to', + 'navigate to', + 'go to', + 'open', + 'close', + 'dismiss', + 'accept', + 'the', + 'a', + 'an', + 'on', + 'button', + 'link', + 'field', + 'input', + 'element', +]; + +/** + * Extract meaningful keywords from a descriptive intent. + * Handles cases like: + * - "click the Add to Cart button" → "add to cart" + * - 'click "Add to Cart"' → "add to cart" + * - "Add to Cart" → "add to cart" + */ +function extractIntentKeywords(intent: string): string[] { + let normalized = intent.toLowerCase().trim(); + + // Extract quoted text first (e.g., 'click "Add to Cart"') + const quotedMatch = normalized.match(/["']([^"']+)["']/); + if (quotedMatch) { + return [quotedMatch[1].trim()]; + } + + // Strip action verbs from the beginning + for (const verb of ACTION_VERBS) { + const pattern = new RegExp(`^${verb}\\s+`, 'i'); + normalized = normalized.replace(pattern, ''); + } + + // Strip trailing words like "button", "link", "element" + normalized = normalized.replace(/\s+(button|link|element|field|input)$/i, ''); + + // Clean up extra spaces + normalized = normalized.replace(/\s+/g, ' ').trim(); + + // If the result is too short, return the original + if (normalized.length < 2) { + return [intent.toLowerCase().replace(/[_-]/g, ' ')]; + } + + return [normalized]; +} + +/** + * Simple intent heuristics using text pattern matching. + * Used as default when no custom heuristics are provided. + * + * This is GENERALIZABLE - it works for any intent by: + * 1. Checking common patterns (add_to_cart, checkout, etc.) + * 2. Extracting keywords from descriptive intents (e.g., "click the X button") + * 3. Falling back to direct text matching + */ +class SimpleIntentHeuristics implements IntentHeuristics { + findElementForIntent( + intent: string, + elements: SnapshotElement[], + _url: string, + _goal: string + ): number | null { + const normalizedIntent = intent.toLowerCase().replace(/[_-]/g, ' '); + + // Build patterns list - start with common patterns, then add extracted keywords + const patterns: string[] = []; + + // Check if this is a known intent pattern + if (COMMON_INTENT_PATTERNS[intent.toLowerCase()]) { + patterns.push(...COMMON_INTENT_PATTERNS[intent.toLowerCase()]); + } else if (COMMON_INTENT_PATTERNS[normalizedIntent]) { + patterns.push(...COMMON_INTENT_PATTERNS[normalizedIntent]); + } + + // Extract keywords from descriptive intent (generalizable) + const keywords = extractIntentKeywords(intent); + patterns.push(...keywords); + + // Also add the raw normalized intent as fallback + if (!patterns.includes(normalizedIntent)) { + patterns.push(normalizedIntent); + } + + // Look for elements matching patterns (prefer clickable elements) + for (const pattern of patterns) { + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + const ariaLabel = (element.ariaLabel || '').toLowerCase(); + const name = (element.name || '').toLowerCase(); + + if (text.includes(pattern) || ariaLabel.includes(pattern) || name.includes(pattern)) { + // Prefer clickable buttons/links + if (element.clickable || element.role === 'button' || element.role === 'link') { + return element.id; + } + } + } + } + + // Second pass: less strict matching (any element) + for (const pattern of patterns) { + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + if (text.includes(pattern)) { + return element.id; + } + } + } + + // Third pass: word-by-word matching for multi-word intents + // This handles cases where the element text is slightly different + const intentWords = normalizedIntent.split(/\s+/).filter(w => w.length > 2); + if (intentWords.length >= 2) { + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + const matchCount = intentWords.filter(word => text.includes(word)).length; + // If more than half the words match, consider it a match + if (matchCount >= Math.ceil(intentWords.length / 2)) { + if (element.clickable || element.role === 'button' || element.role === 'link') { + return element.id; + } + } + } + } + + return null; + } + + priorityOrder(): string[] { + return ['add_to_cart', 'checkout', 'search', 'submit', 'close', 'login']; + } +} + +// --------------------------------------------------------------------------- +// AgentRuntime Interface (minimal for MVP) +// --------------------------------------------------------------------------- + +/** + * Minimal runtime interface for browser control. + * This will be replaced with the full AgentRuntime integration. + */ +export interface AgentRuntime { + /** Take a snapshot of the current page */ + snapshot(options?: { + limit?: number; + screenshot?: boolean; + goal?: string; + }): Promise; + + /** Navigate to a URL */ + goto(url: string): Promise; + + /** Click an element by ID */ + click(elementId: number): Promise; + + /** Type text into an element */ + type(elementId: number, text: string): Promise; + + /** Press a key */ + pressKey(key: string): Promise; + + /** Scroll the page */ + scroll(direction: 'up' | 'down'): Promise; + + /** Get current URL */ + getCurrentUrl(): Promise; + + /** Get viewport height */ + getViewportHeight(): Promise; + + /** Scroll by delta (returns true if scroll was effective) */ + scrollBy(dy: number): Promise; +} + +// --------------------------------------------------------------------------- +// PlannerExecutorAgent Options +// --------------------------------------------------------------------------- + +/** + * Options for creating a PlannerExecutorAgent. + */ +export interface PlannerExecutorAgentOptions { + /** LLM for generating plans (recommend 7B+ model) */ + planner: LLMProvider; + /** LLM for executing steps (3B-7B model) */ + executor: LLMProvider; + /** Agent configuration (merged with defaults) */ + config?: DeepPartial; + /** Pre-action authorization hook */ + preActionAuthorizer?: PreActionAuthorizer; + /** Custom intent heuristics for element selection */ + intentHeuristics?: IntentHeuristics; + /** Enable verbose logging */ + verbose?: boolean; +} + +// --------------------------------------------------------------------------- +// PlannerExecutorAgent +// --------------------------------------------------------------------------- + +/** + * Two-tier agent architecture with Planner and Executor models. + * + * The Planner (typically 7B+ parameters) generates JSON execution plans + * with predicates. The Executor (3B-7B parameters) executes each step + * using a snapshot-first approach. + * + * @example + * ```typescript + * import { PlannerExecutorAgent, OllamaProvider } from '@predicatesystems/runtime'; + * + * const planner = new OllamaProvider({ model: 'qwen3:8b' }); + * const executor = new OllamaProvider({ model: 'qwen3:4b' }); + * + * const agent = new PlannerExecutorAgent({ + * planner, + * executor, + * config: { stepwise: { maxSteps: 20 }, verbose: true }, + * }); + * + * const result = await agent.runStepwise(runtime, { + * task: 'Search for laptops and add first result to cart', + * startUrl: 'https://amazon.com', + * }); + * ``` + */ +export class PlannerExecutorAgent { + readonly planner: LLMProvider; + readonly executor: LLMProvider; + readonly config: PlannerExecutorConfig; + + private preActionAuthorizer?: PreActionAuthorizer; + private intentHeuristics: IntentHeuristics; + private tokenCollector = new TokenUsageCollector(); + + // Run state + private runId: string | null = null; + private actionHistory: ActionRecord[] = []; + private currentStepIndex = 0; + private currentStep: { action: string; intent?: string } | null = null; + + constructor(options: PlannerExecutorAgentOptions) { + this.planner = options.planner; + this.executor = options.executor; + this.config = mergeConfig(options.config || {}); + if (options.verbose !== undefined) { + this.config = { ...this.config, verbose: options.verbose }; + } + this.preActionAuthorizer = options.preActionAuthorizer; + this.intentHeuristics = options.intentHeuristics || new SimpleIntentHeuristics(); + } + + // --------------------------------------------------------------------------- + // Token Stats + // --------------------------------------------------------------------------- + + /** + * Get token usage statistics for the agent session. + */ + getTokenStats(): TokenUsageSummary { + return this.tokenCollector.summary(); + } + + /** + * Reset token usage statistics. + */ + resetTokenStats(): void { + this.tokenCollector.reset(); + } + + private recordTokenUsage(role: string, resp: LLMResponse): void { + try { + this.tokenCollector.record(role, resp); + } catch { + // Don't fail on token tracking errors + } + } + + // --------------------------------------------------------------------------- + // Stepwise Run (ReAct-style) + // --------------------------------------------------------------------------- + + /** + * Run task using stepwise (ReAct-style) planning. + * + * Plans one step at a time based on current page state, adapting to + * page changes as they happen. More reliable with small models. + * + * @param runtime - Browser runtime for page control + * @param options - Task options + * @returns Run outcome + */ + async runStepwise( + runtime: AgentRuntime, + options: { + task: string; + startUrl?: string; + } + ): Promise { + const { task, startUrl } = options; + const startTime = Date.now(); + + // Initialize run state + this.runId = `run-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`; + this.actionHistory = []; + this.currentStepIndex = 0; + this.tokenCollector.reset(); + + const stepOutcomes: StepOutcome[] = []; + let currentUrl = ''; + let success = false; + let error: string | undefined; + + try { + // Navigate to start URL if provided + if (startUrl) { + if (this.config.verbose) { + console.log(`[NAVIGATE] ${startUrl}`); + } + await runtime.goto(startUrl); + currentUrl = startUrl; + } else { + currentUrl = await runtime.getCurrentUrl(); + } + + // Stepwise loop + const maxSteps = this.config.stepwise.maxSteps; + + for (let stepNum = 1; stepNum <= maxSteps; stepNum++) { + this.currentStepIndex = stepNum; + const stepStart = Date.now(); + + if (this.config.verbose) { + console.log(`\n${'='.repeat(60)}`); + console.log(`[STEP ${stepNum}/${maxSteps}]`); + console.log(`${'='.repeat(60)}`); + } + + // Take snapshot with escalation + const ctx = await this.snapshotWithEscalation(runtime, task); + currentUrl = ctx.snapshot?.url || currentUrl; + + if (this.config.verbose) { + const elementCount = ctx.snapshot?.elements?.length || 0; + console.log(`[SNAPSHOT] ${elementCount} elements, limit=${ctx.limitUsed}`); + // Debug: show searchbox/textbox elements specifically + const allElements = ctx.snapshot?.elements || []; + const searchElements = allElements.filter(el => + ['textbox', 'searchbox', 'combobox', 'input'].includes((el.role || '').toLowerCase()) + ); + console.log( + ` [SEARCH ELEMENTS] Found ${searchElements.length} textbox/searchbox/combobox/input elements:` + ); + for (const el of searchElements.slice(0, 3)) { + console.log( + ` [EL ${el.id}] role=${el.role}, text=${(el.text || '').slice(0, 60).replace(/\n/g, ' ')}, clickable=${el.clickable}` + ); + } + if (searchElements.length === 0) { + // Show first 5 elements for debugging + console.log(` [FIRST 5 ELEMENTS]:`); + for (const el of allElements.slice(0, 5)) { + console.log( + ` [EL ${el.id}] role=${el.role}, text=${(el.text || '').slice(0, 40).replace(/\n/g, ' ')}` + ); + } + } + } + + // Get planner's next action + const [systemPrompt, userPrompt] = buildStepwisePlannerPrompt( + task, + currentUrl, + ctx.compactRepresentation, + this.actionHistory.slice(-this.config.stepwise.actionHistoryLimit) + ); + + if (this.config.verbose) { + // Show the compact representation being sent to the LLM + const contextLines = ctx.compactRepresentation.split('\n'); + console.log(`[PLANNER PROMPT] Sending ${contextLines.length} element lines to LLM:`); + // Show header and first few elements + for (const line of contextLines.slice(0, 6)) { + console.log(` ${line}`); + } + if (contextLines.length > 6) { + console.log(` ... (${contextLines.length - 6} more elements)`); + } + } + + let plannerResp: LLMResponse; + try { + plannerResp = await this.planner.generate(systemPrompt, userPrompt, { + temperature: this.config.plannerTemperature, + max_tokens: this.config.plannerMaxTokens, + }); + this.recordTokenUsage('planner', plannerResp); + } catch (plannerError) { + // Log planner call failure + if (this.config.verbose) { + console.log(`[PLANNER ERROR] LLM call failed: ${plannerError}`); + } + stepOutcomes.push({ + stepId: stepNum, + goal: 'Call planner LLM', + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: `Planner LLM call failed: ${plannerError instanceof Error ? plannerError.message : String(plannerError)}`, + }); + continue; + } + + if (this.config.verbose) { + // Show raw response for debugging (truncated if very long) + const rawLen = plannerResp.content.length; + const hasThink = plannerResp.content.includes(''); + const displayContent = + rawLen > 300 + ? plannerResp.content.slice(0, 300) + `... (${rawLen} chars)` + : plannerResp.content; + console.log(`[PLANNER]${hasThink ? ' (has )' : ''} ${displayContent}`); + } + + // Check for empty response + if (!plannerResp.content || plannerResp.content.trim().length === 0) { + if (this.config.verbose) { + console.log(`[PLANNER ERROR] Empty response from LLM`); + } + stepOutcomes.push({ + stepId: stepNum, + goal: 'Parse planner response', + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: 'Planner returned empty response', + }); + continue; + } + + // Parse planner response + let plannerAction: StepwisePlannerResponse; + try { + plannerAction = extractJson(plannerResp.content) as unknown as StepwisePlannerResponse; + } catch (e) { + // Try to recover from malformed JSON + const parsed = parseAction(plannerResp.content); + if (parsed.action !== 'UNKNOWN') { + plannerAction = { + action: parsed.action as StepwisePlannerResponse['action'], + input: parsed.args[1] as string | undefined, + }; + } else { + if (this.config.verbose) { + console.log(`[PLANNER ERROR] Raw response: ${plannerResp.content.slice(0, 200)}`); + } + stepOutcomes.push({ + stepId: stepNum, + goal: 'Parse planner response', + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: `Failed to parse planner response: ${e}`, + }); + continue; + } + } + + // Handle DONE action + if (plannerAction.action === 'DONE') { + if (this.config.verbose) { + console.log(`[DONE] Task completed`); + } + stepOutcomes.push({ + stepId: stepNum, + goal: 'Task completed', + status: StepStatus.SUCCESS, + actionTaken: 'DONE', + verificationPassed: true, + usedVision: false, + durationMs: Date.now() - stepStart, + }); + success = true; + break; + } + + // Execute the action + const outcome = await this.executeStepwiseAction( + runtime, + plannerAction, + stepNum, + task, + ctx, + stepStart + ); + stepOutcomes.push(outcome); + + // Record action history + const urlAfter = await runtime.getCurrentUrl(); + this.actionHistory.push({ + stepNum, + action: plannerAction.action, + target: plannerAction.input || plannerAction.intent || null, + result: outcome.status === StepStatus.SUCCESS ? 'success' : 'failed', + urlAfter, + }); + + // Update current URL + currentUrl = urlAfter; + + // Check for repeated failures + const recentFailures = stepOutcomes.slice(-3).filter(o => o.status === StepStatus.FAILED); + if (recentFailures.length >= 3) { + error = 'Too many consecutive failures'; + break; + } + } + + // If we ran out of steps without DONE, mark as failed + if (!success && !error) { + error = `Exceeded maximum steps (${maxSteps})`; + } + } catch (e) { + error = e instanceof Error ? e.message : String(e); + } + + return { + runId: this.runId, + task, + success, + stepsCompleted: stepOutcomes.filter(o => o.status === StepStatus.SUCCESS).length, + stepsTotal: stepOutcomes.length, + replansUsed: 0, // Stepwise doesn't use replanning + stepOutcomes, + totalDurationMs: Date.now() - startTime, + error, + tokenUsage: this.tokenCollector.summary(), + fallbackUsed: false, + }; + } + + // --------------------------------------------------------------------------- + // Execute Stepwise Action + // --------------------------------------------------------------------------- + + private async executeStepwiseAction( + runtime: AgentRuntime, + plannerAction: StepwisePlannerResponse, + stepNum: number, + task: string, + ctx: SnapshotContext, + stepStart: number + ): Promise { + const currentUrl = ctx.snapshot?.url || ''; + + // Handle SCROLL action + if (plannerAction.action === 'SCROLL') { + const direction = plannerAction.direction || 'down'; + try { + await runtime.scroll(direction); + return { + stepId: stepNum, + goal: `Scroll ${direction}`, + status: StepStatus.SUCCESS, + actionTaken: `SCROLL(${direction})`, + verificationPassed: true, + usedVision: false, + durationMs: Date.now() - stepStart, + }; + } catch (e) { + return { + stepId: stepNum, + goal: `Scroll ${direction}`, + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: e instanceof Error ? e.message : String(e), + }; + } + } + + // For CLICK and TYPE_AND_SUBMIT, we need to find the element + const isTypeAction = plannerAction.action === 'TYPE_AND_SUBMIT'; + + // Actions that need to find a target element + const elementTargetingActions = ['CLICK', 'TYPE_AND_SUBMIT', 'SCROLL_TO']; + const needsElementLookup = elementTargetingActions.includes(plannerAction.action); + + // Check if target element exists in current snapshot + // If not, try scroll-after-escalation to find it + // This is GENERALIZABLE - works for any action that needs to find an element + let activeCtx = ctx; + if (needsElementLookup && this.config.snapshot.scrollAfterEscalation && plannerAction.intent) { + const elements = ctx.snapshot?.elements || []; + const url = ctx.snapshot?.url || ''; + const foundElement = this.tryIntentHeuristics(plannerAction.intent, elements, url, task); + + if (foundElement === null) { + // Element not found in current viewport - try scroll-after-escalation + if (this.config.verbose) { + console.log( + `[SCROLL-TO-FIND] Target "${plannerAction.intent}" not in viewport, scrolling to find...` + ); + } + + const newCtx = await this.snapshotWithEscalation(runtime, task, { + action: plannerAction.action, + intent: plannerAction.intent, + }); + + if (newCtx.snapshot) { + activeCtx = newCtx; + if (this.config.verbose) { + console.log( + `[SCROLL-TO-FIND] Updated context with ${newCtx.snapshot.elements.length} elements` + ); + } + } + } + } + + const [execSystem, execUser] = buildExecutorPrompt( + plannerAction.intent || `${plannerAction.action} element`, + plannerAction.intent, + activeCtx.compactRepresentation, + plannerAction.input, + undefined, // category + plannerAction.action + ); + + if (this.config.verbose) { + console.log(`[EXECUTOR PROMPT] system len=${execSystem.length}, user len=${execUser.length}`); + console.log(`[EXECUTOR USER PROMPT (first 300)]:\n${execUser.slice(0, 300)}...`); + } + + const executorResp = await this.executor.generate(execSystem, execUser, { + temperature: this.config.executorTemperature, + max_tokens: this.config.executorMaxTokens, + }); + this.recordTokenUsage('executor', executorResp); + + if (this.config.verbose) { + // Show raw response for debugging (truncated if very long) + const rawLen = executorResp.content.length; + const hasThink = executorResp.content.includes(''); + // Show more of the response for debugging + const displayContent = + rawLen > 500 + ? executorResp.content.slice(0, 500) + `... (${rawLen} chars total)` + : executorResp.content; + console.log( + `[EXECUTOR RAW]${hasThink ? ' (has )' : ''} len=${rawLen}:\n${displayContent}` + ); + } + + // Parse executor response + const parsed = parseAction(executorResp.content); + + // Debug: Show parsed result + if (this.config.verbose) { + console.log(`[EXECUTOR PARSED] ${parsed.action}, args: ${JSON.stringify(parsed.args)}`); + } + + if (parsed.action === 'NONE') { + return { + stepId: stepNum, + goal: plannerAction.intent || plannerAction.action, + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: 'Executor could not find suitable element', + }; + } + + if (parsed.action === 'UNKNOWN') { + return { + stepId: stepNum, + goal: plannerAction.intent || plannerAction.action, + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: `Failed to parse executor response: ${executorResp.content}`, + }; + } + + // Pre-action authorization + if (this.preActionAuthorizer) { + const actionContext = { + type: parsed.action, + elementId: parsed.args[0] as number | undefined, + value: isTypeAction + ? ((plannerAction.input || parsed.args[1]) as string | undefined) + : undefined, + }; + const authResult = await this.preActionAuthorizer(actionContext, { + url: currentUrl, + stepGoal: plannerAction.intent || plannerAction.action, + taskGoal: task, + }); + + if (!authResult.allowed) { + return { + stepId: stepNum, + goal: plannerAction.intent || plannerAction.action, + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: `Action denied by policy: ${authResult.reason || 'unauthorized'}`, + }; + } + } + + // Execute the action + try { + const elementId = parsed.args[0] as number; + + if (parsed.action === 'CLICK') { + await runtime.click(elementId); + return { + stepId: stepNum, + goal: plannerAction.intent || 'Click element', + status: StepStatus.SUCCESS, + actionTaken: `CLICK(${elementId})`, + verificationPassed: true, + usedVision: false, + durationMs: Date.now() - stepStart, + urlBefore: currentUrl, + }; + } else if (parsed.action === 'TYPE') { + const text = plannerAction.input || (parsed.args[1] as string) || ''; + await runtime.type(elementId, text); + + // Submit with Enter key for TYPE_AND_SUBMIT + if (plannerAction.action === 'TYPE_AND_SUBMIT') { + const preUrl = await runtime.getCurrentUrl(); + let submitMethod: 'enter' | 'click' = 'enter'; + let urlChanged = false; + + // First attempt: Submit with Enter key (more reliable for search) + await runtime.pressKey('Enter'); + + // Wait for URL to change after form submission + urlChanged = await this.waitForUrlChange(runtime, preUrl, 5000); + + if (this.config.verbose) { + if (urlChanged) { + const newUrl = await runtime.getCurrentUrl(); + console.log(`[TYPE_AND_SUBMIT] URL changed after Enter: ${newUrl.slice(0, 60)}...`); + } else { + console.log(`[TYPE_AND_SUBMIT] URL unchanged after Enter, attempting retry...`); + } + } + + // Retry with button click if Enter didn't work + if (!urlChanged && this.config.retry.executorRepairAttempts > 0) { + // Find submit button near the input element + const submitButtonId = this.findSubmitButton( + activeCtx.snapshot?.elements || [], + elementId + ); + + if (submitButtonId !== null) { + if (this.config.verbose) { + console.log( + `[TYPE_AND_SUBMIT-RETRY] Found submit button ${submitButtonId}, retrying with click` + ); + } + + try { + await runtime.click(submitButtonId); + submitMethod = 'click'; + urlChanged = await this.waitForUrlChange(runtime, preUrl, 5000); + + if (this.config.verbose && urlChanged) { + const newUrl = await runtime.getCurrentUrl(); + console.log( + `[TYPE_AND_SUBMIT-RETRY] URL changed after click: ${newUrl.slice(0, 60)}...` + ); + } + } catch (e) { + if (this.config.verbose) { + console.log(`[TYPE_AND_SUBMIT-RETRY] Click failed: ${e}`); + } + } + } else if (this.config.verbose) { + console.log(`[TYPE_AND_SUBMIT-RETRY] No submit button found for retry`); + } + } + + // Wait for page to stabilize + if (urlChanged) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + return { + stepId: stepNum, + goal: plannerAction.intent || 'Type text', + status: StepStatus.SUCCESS, + actionTaken: `TYPE(${elementId}, "${text}")`, + verificationPassed: true, + usedVision: false, + durationMs: Date.now() - stepStart, + urlBefore: currentUrl, + }; + } else if (parsed.action === 'PRESS') { + const key = parsed.args[0] as string; + await runtime.pressKey(key); + return { + stepId: stepNum, + goal: plannerAction.intent || `Press ${key}`, + status: StepStatus.SUCCESS, + actionTaken: `PRESS(${key})`, + verificationPassed: true, + usedVision: false, + durationMs: Date.now() - stepStart, + }; + } + + return { + stepId: stepNum, + goal: plannerAction.intent || plannerAction.action, + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: `Unknown action type: ${parsed.action}`, + }; + } catch (e) { + return { + stepId: stepNum, + goal: plannerAction.intent || plannerAction.action, + status: StepStatus.FAILED, + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: e instanceof Error ? e.message : String(e), + }; + } + } + + // --------------------------------------------------------------------------- + // Snapshot with Escalation + // --------------------------------------------------------------------------- + + /** + * Take snapshot with progressive limit escalation and optional scroll-to-find. + * + * Starts with base limit and increases on element-not-found scenarios + * to capture more of the page. After exhausting limit escalation, if + * scrollAfterEscalation is enabled, scrolls down/up to find elements + * that may be outside the current viewport. + * + * @param runtime - Browser runtime for snapshots and scrolling + * @param goal - Goal string for context formatting + * @param step - Optional step info for intent-based element detection during scroll + */ + private async snapshotWithEscalation( + runtime: AgentRuntime, + goal: string, + step?: { action: string; intent?: string } + ): Promise { + const cfg = this.config.snapshot; + let currentLimit = cfg.limitBase; + const maxLimit = cfg.enabled ? cfg.limitMax : cfg.limitBase; + let lastSnapshot: Awaited> = null; + let lastCompact = ''; + let requiresVision = false; + let visionReason: string | null = null; + + // Phase 1: Limit escalation loop + while (currentLimit <= maxLimit) { + try { + const snap = await runtime.snapshot({ + limit: currentLimit, + screenshot: false, + goal, + }); + + if (snap === null) { + if (!cfg.enabled) break; + currentLimit = Math.min(currentLimit + cfg.limitStep, maxLimit + 1); + continue; + } + + lastSnapshot = snap; + lastCompact = formatContext(snap.elements || [], currentLimit); + + // If escalation disabled, we're done after first successful snapshot + if (!cfg.enabled) break; + + // Check element count - if sufficient, no need to escalate + const elementCount = snap.elements?.length || 0; + if (elementCount >= 10) break; + + // Escalate limit + if (currentLimit < maxLimit) { + currentLimit = Math.min(currentLimit + cfg.limitStep, maxLimit); + if (this.config.verbose) { + console.log( + `[ESCALATION] Low element count (${elementCount}), increasing limit to ${currentLimit}` + ); + } + } else { + break; + } + } catch (e) { + if (this.config.verbose) { + console.error(`[SNAPSHOT] Error: ${e}`); + } + if (!cfg.enabled) break; + currentLimit = Math.min(currentLimit + cfg.limitStep, maxLimit + 1); + } + } + + // Phase 2: Scroll-after-escalation + // Only trigger for CLICK actions with specific intents + const shouldTryScroll = + cfg.scrollAfterEscalation && + step !== undefined && + lastSnapshot !== null && + !requiresVision && + step.action === 'CLICK' && + step.intent; + + if (shouldTryScroll && lastSnapshot) { + // Check if we can find the target element using intent heuristics + const elements = lastSnapshot.elements || []; + const url = lastSnapshot.url || ''; + let foundElement = this.tryIntentHeuristics(step.intent!, elements, url, goal); + + if (foundElement === null) { + // Element not found in current viewport - try scrolling + if (this.config.verbose) { + console.log( + `[SNAPSHOT-ESCALATION] Target element not found for intent "${step.intent}", trying scroll-after-escalation...` + ); + } + + // Get viewport height and calculate scroll delta + const viewportHeight = await runtime.getViewportHeight(); + const scrollDelta = viewportHeight * cfg.scrollViewportFraction; + + for (const direction of cfg.scrollDirections) { + // Map direction to dy (pixels): down=positive, up=negative + const scrollDy = direction === 'down' ? scrollDelta : -scrollDelta; + + for (let scrollNum = 0; scrollNum < cfg.scrollMaxAttempts; scrollNum++) { + if (this.config.verbose) { + console.log( + `[SNAPSHOT-ESCALATION] Scrolling ${direction} (${scrollNum + 1}/${cfg.scrollMaxAttempts})...` + ); + } + + // Scroll with verification + const scrollEffective = await runtime.scrollBy(scrollDy); + + if (!scrollEffective) { + if (this.config.verbose) { + console.log( + `[SNAPSHOT-ESCALATION] Scroll ${direction} had no effect (reached boundary), skipping remaining attempts` + ); + } + break; // No point trying more scrolls in this direction + } + + // Wait for stabilization after successful scroll + if (cfg.scrollStabilizeMs > 0) { + await new Promise(resolve => setTimeout(resolve, cfg.scrollStabilizeMs)); + } + + // Take new snapshot at max limit (we already escalated) + try { + const snap = await runtime.snapshot({ + limit: cfg.limitMax, + screenshot: false, + goal, + }); + + if (snap === null) continue; + + lastSnapshot = snap; + lastCompact = formatContext(snap.elements || [], cfg.limitMax); + + // Check if target element is now visible + const newElements = snap.elements || []; + const newUrl = snap.url || ''; + foundElement = this.tryIntentHeuristics(step.intent!, newElements, newUrl, goal); + + if (foundElement !== null) { + if (this.config.verbose) { + console.log( + `[SNAPSHOT-ESCALATION] Found target element ${foundElement} after scrolling ${direction}` + ); + } + break; // Break out of scroll attempts loop + } + } catch { + continue; + } + } + + // If found, break out of direction loop + if (foundElement !== null) break; + } + + if (foundElement === null && this.config.verbose) { + console.log(`[SNAPSHOT-ESCALATION] Target element not found after scrolling`); + } + } + } + + // Fallback for failed capture + if (lastSnapshot === null) { + lastSnapshot = { url: '', title: '', elements: [] }; + requiresVision = true; + visionReason = 'snapshot_capture_failed'; + } + + return { + snapshot: lastSnapshot, + compactRepresentation: lastCompact, + screenshotBase64: null, + capturedAt: new Date(), + limitUsed: currentLimit, + snapshotSuccess: !requiresVision, + requiresVision, + visionReason, + pruningCategory: null, + prunedNodeCount: 0, + }; + } + + // --------------------------------------------------------------------------- + // Intent Heuristics Helper + // --------------------------------------------------------------------------- + + /** + * Try to find an element using intent heuristics. + * + * @param intent - Intent string (e.g., "add_to_cart") + * @param elements - Available elements from snapshot + * @param url - Current page URL + * @param goal - Step goal description + * @returns Element ID if found, null otherwise + */ + private tryIntentHeuristics( + intent: string, + elements: SnapshotElement[], + url: string, + goal: string + ): number | null { + try { + return this.intentHeuristics.findElementForIntent(intent, elements, url, goal); + } catch { + return null; + } + } + + // --------------------------------------------------------------------------- + // Retry/Repair Helpers + // --------------------------------------------------------------------------- + + /** + * Wait for URL to change from the given URL. + * + * @param runtime - Browser runtime + * @param originalUrl - URL to compare against + * @param timeoutMs - Maximum wait time in milliseconds + * @returns true if URL changed, false if timeout + */ + private async waitForUrlChange( + runtime: AgentRuntime, + originalUrl: string, + timeoutMs: number + ): Promise { + const startTime = Date.now(); + const pollInterval = 500; + + while (Date.now() - startTime < timeoutMs) { + await new Promise(resolve => setTimeout(resolve, pollInterval)); + try { + const currentUrl = await runtime.getCurrentUrl(); + if (currentUrl !== originalUrl) { + return true; + } + } catch { + // Ignore errors during URL check + } + } + return false; + } + + /** + * Find a submit button near the input element. + * + * Looks for buttons/links with submit-related text that appear after + * the input element in the DOM (higher element ID typically means + * later in DOM order). + * + * @param elements - Snapshot elements + * @param inputElementId - ID of the input element + * @returns Submit button element ID if found, null otherwise + */ + private findSubmitButton(elements: SnapshotElement[], inputElementId: number): number | null { + // Submit-related patterns + const submitPatterns = [ + 'search', + 'go', + 'find', + 'submit', + 'send', + 'enter', + 'apply', + 'ok', + 'done', + ]; + + // Icon patterns (exact match) + const iconPatterns = ['>', '→', '🔍', '⌕']; + + // Look for submit buttons + const candidates: Array<{ id: number; score: number }> = []; + + for (const element of elements) { + // Only consider buttons and links + const role = (element.role || '').toLowerCase(); + if (!['button', 'link', 'searchbox'].includes(role)) continue; + + // Skip if not clickable + if (element.clickable === false) continue; + + // Skip the input element itself + if (element.id === inputElementId) continue; + + const text = (element.text || '').toLowerCase().trim(); + const ariaLabel = (element.ariaLabel || '').toLowerCase(); + + // Check for icon patterns (exact match, high priority) + for (const icon of iconPatterns) { + if (text === icon || ariaLabel === icon) { + candidates.push({ id: element.id, score: 200 + Math.abs(element.id - inputElementId) }); + break; + } + } + + // Check for submit patterns + for (let i = 0; i < submitPatterns.length; i++) { + const pattern = submitPatterns[i]; + if (text.includes(pattern) || ariaLabel.includes(pattern)) { + // Score: pattern priority + proximity to input element + // Lower distance from input = higher score + const proximityBonus = 100 - Math.min(Math.abs(element.id - inputElementId), 100); + candidates.push({ id: element.id, score: 100 - i + proximityBonus }); + break; + } + } + } + + // Return best candidate (highest score) + if (candidates.length === 0) return null; + candidates.sort((a, b) => b.score - a.score); + return candidates[0].id; + } +} diff --git a/src/agents/planner-executor/playwright-runtime.ts b/src/agents/planner-executor/playwright-runtime.ts new file mode 100644 index 0000000..b392b56 --- /dev/null +++ b/src/agents/planner-executor/playwright-runtime.ts @@ -0,0 +1,587 @@ +/** + * PlaywrightRuntime: Browser automation using Playwright/Chromium. + * + * Implements the AgentRuntime interface for PlannerExecutorAgent using + * Playwright for real browser automation with the Predicate browser extension. + */ + +import { Page, BrowserContext } from 'playwright'; +import { PredicateBrowser } from '../../browser'; +import { snapshot as takeSnapshot } from '../../snapshot'; +import { click, typeText, press } from '../../actions'; +import type { Snapshot as SDKSnapshot, Element as SDKElement } from '../../types'; +import type { Snapshot, SnapshotElement } from './plan-models'; +import type { AgentRuntime } from './planner-executor-agent'; + +/** + * Options for creating a PlaywrightRuntime. + */ +export interface PlaywrightRuntimeOptions { + /** + * Run browser in headless mode. + * Defaults to true in CI, false locally. + */ + headless?: boolean; + + /** + * API key for Predicate/Sentience backend processing. + */ + apiKey?: string; + + /** + * API URL for Predicate/Sentience backend. + */ + apiUrl?: string; + + /** + * Proxy server URL (e.g., 'http://user:pass@proxy.example.com:8080'). + */ + proxy?: string; + + /** + * Path to user data directory for persistent sessions. + */ + userDataDir?: string; + + /** + * Storage state to inject (cookies + localStorage). + */ + storageState?: string | object; + + /** + * Directory to save video recordings. + */ + recordVideoDir?: string; + + /** + * Video resolution. + */ + recordVideoSize?: { width: number; height: number }; + + /** + * Viewport size. + */ + viewport?: { width: number; height: number }; + + /** + * Device scale factor (e.g., 2.0 for Retina). + */ + deviceScaleFactor?: number; + + /** + * Allowed domains for navigation. + */ + allowedDomains?: string[]; + + /** + * Prohibited domains for navigation. + */ + prohibitedDomains?: string[]; + + /** + * Keep browser alive after close() (no teardown). + */ + keepAlive?: boolean; + + /** + * Default timeout for operations (ms). + */ + timeout?: number; + + /** + * Show visual overlay highlighting elements in browser. + * Useful for debugging and demos. + */ + showOverlay?: boolean; +} + +/** + * PlaywrightRuntime implements AgentRuntime using Playwright/Chromium. + * + * Provides real browser automation with the Predicate browser extension for + * snapshot-based element selection and interaction. + * + * @example + * ```typescript + * const runtime = new PlaywrightRuntime({ headless: false }); + * await runtime.start(); + * + * const agent = new PlannerExecutorAgent({ planner, executor }); + * const result = await agent.runStepwise(runtime, { + * task: 'Search for laptops on Amazon', + * startUrl: 'https://www.amazon.com', + * }); + * + * await runtime.close(); + * ``` + */ +export class PlaywrightRuntime implements AgentRuntime { + private browser: InstanceType; + private options: PlaywrightRuntimeOptions; + private started = false; + + constructor(options: PlaywrightRuntimeOptions = {}) { + this.options = { + timeout: 30000, + ...options, + }; + + // Create PredicateBrowser with options + this.browser = new PredicateBrowser( + options.apiKey, + options.apiUrl, + options.headless, + options.proxy, + options.userDataDir, + options.storageState, + options.recordVideoDir, + options.recordVideoSize, + options.viewport, + options.deviceScaleFactor, + options.allowedDomains, + options.prohibitedDomains, + options.keepAlive ?? false + ); + } + + /** + * Start the browser and initialize the runtime. + */ + async start(): Promise { + if (this.started) { + return; + } + + await this.browser.start(); + this.started = true; + } + + /** + * Close the browser and clean up resources. + * + * @param outputPath - Optional path to save video recording + * @returns Path to video file if recording was enabled + */ + async close(outputPath?: string): Promise { + if (!this.started) { + return null; + } + + const videoPath = await this.browser.close(outputPath); + this.started = false; + return videoPath; + } + + /** + * Get the underlying Playwright Page instance. + */ + getPage(): Page | null { + return this.browser.getPage(); + } + + /** + * Get the underlying BrowserContext. + */ + getContext(): BrowserContext | null { + return this.browser.getContext(); + } + + /** + * Ensure browser is started. + */ + private ensureStarted(): void { + if (!this.started) { + throw new Error('PlaywrightRuntime not started. Call start() first.'); + } + } + + // --------------------------------------------------------------------------- + // AgentRuntime Interface Implementation + // --------------------------------------------------------------------------- + + /** + * Take a snapshot of the current page. + * + * Uses the Predicate browser extension for semantic element extraction with + * importance ranking and pruning. + */ + async snapshot(options?: { + limit?: number; + screenshot?: boolean; + goal?: string; + }): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + + // Wait for page to be stable before taking snapshot + if (page) { + try { + await page.waitForLoadState('domcontentloaded', { timeout: 5000 }); + } catch { + // Best effort - continue even if timeout + } + } + + try { + // Get snapshot using SDK's snapshot function + const snap = await takeSnapshot(this.browser, { + limit: options?.limit || 100, + screenshot: options?.screenshot ?? false, + goal: options?.goal, + show_overlay: this.options.showOverlay ?? true, + }); + + if (!snap || snap.status !== 'success') { + return null; + } + + // Convert to AgentRuntime Snapshot format + return await this.convertSnapshot(snap); + } catch (e) { + console.error('[PlaywrightRuntime] Snapshot error:', e); + return null; + } + } + + /** + * Convert SDK snapshot to AgentRuntime format. + */ + private async convertSnapshot(snap: SDKSnapshot): Promise { + const page = this.browser.getPage(); + const title = page ? await page.title() : ''; + + const elements = (snap.elements || []).map(el => this.convertElement(el)); + + // Debug: Log element role distribution + const roleCounts = new Map(); + const clickableCounts = new Map(); + for (const el of elements) { + const role = (el.role || 'none').toLowerCase(); + roleCounts.set(role, (roleCounts.get(role) || 0) + 1); + if (el.clickable) { + clickableCounts.set(role, (clickableCounts.get(role) || 0) + 1); + } + } + const rolesSummary = Array.from(roleCounts.entries()) + .sort((a, b) => b[1] - a[1]) + .slice(0, 6) + .map(([r, c]) => `${r}:${c}`) + .join(', '); + const clickableSummary = Array.from(clickableCounts.entries()) + .sort((a, b) => b[1] - a[1]) + .map(([r, c]) => `${r}:${c}`) + .join(', '); + console.log(` [convertSnapshot] Elements: ${elements.length}, Roles: ${rolesSummary}`); + console.log(` [convertSnapshot] Clickable by role: ${clickableSummary || 'none'}`); + + return { + url: snap.url, + title, + elements, + screenshot: snap.screenshot, + status: snap.status, + }; + } + + /** + * Convert snapshot element to AgentRuntime format. + */ + private convertElement(el: SDKElement): SnapshotElement { + // SDK Element uses different field naming conventions + // Convert to agent-friendly format + // Extract clickable and isPrimary from visual_cues if available + const visualCues = (el as any).visual_cues; + + // Determine clickable status: + // 1. Input elements (textbox, searchbox, combobox, input, textarea) are ALWAYS interactive + // 2. Elements with href are ALWAYS clickable (they're links) + // 3. For other elements, use visual_cues.is_clickable or fall back to role check + const role = (el.role || '').toLowerCase(); + const isInputRole = ['textbox', 'searchbox', 'combobox', 'input', 'textarea'].includes(role); + const hasHref = Boolean(el.href); + const isClickable = + isInputRole || hasHref || (visualCues?.is_clickable ?? this.isInteractiveRole(el.role)); + + const isPrimary = visualCues?.is_primary ?? false; + const hasBackground = Boolean(visualCues?.background_color_name); + + return { + id: el.id, + role: el.role || '', + text: el.text || el.name || '', + name: el.name || undefined, + clickable: isClickable, + importance: el.importance ?? 0, + isPrimary, + background: hasBackground, + nearbyText: el.nearby_text || undefined, + ordinal: el.group_index?.toString(), + inDominantGroup: el.group_key !== undefined, + href: el.href, + }; + } + + /** + * Check if a role is typically interactive. + */ + private isInteractiveRole(role: string): boolean { + const interactiveRoles = new Set([ + 'button', + 'link', + 'textbox', + 'searchbox', + 'combobox', + 'checkbox', + 'radio', + 'slider', + 'tab', + 'menuitem', + 'option', + 'switch', + 'a', + 'input', + 'select', + 'textarea', + ]); + return interactiveRoles.has((role || '').toLowerCase()); + } + + /** + * Navigate to a URL. + */ + async goto(url: string): Promise { + this.ensureStarted(); + await this.browser.goto(url); + + // Wait for page to be ready after navigation + const page = this.browser.getPage(); + if (page) { + try { + await page.waitForLoadState('domcontentloaded', { timeout: 10000 }); + // Small delay to let any client-side JS settle + await page.waitForTimeout(500); + } catch { + // Best effort - continue even if timeout + } + } + } + + /** + * Click an element by its snapshot ID. + * + * Uses the Predicate browser extension's element registry to find and click + * the element by its semantic ID. + */ + async click(elementId: number): Promise { + this.ensureStarted(); + + const result = await click(this.browser, elementId); + if (!result.success) { + throw new Error(result.error?.reason || `Click failed for element ${elementId}`); + } + } + + /** + * Type text into an element by its snapshot ID. + */ + async type(elementId: number, text: string): Promise { + this.ensureStarted(); + + const result = await typeText(this.browser, elementId, text); + if (!result.success) { + throw new Error(result.error?.reason || `Type failed for element ${elementId}`); + } + } + + /** + * Press a keyboard key. + */ + async pressKey(key: string): Promise { + this.ensureStarted(); + + const result = await press(this.browser, key); + if (!result.success) { + throw new Error(result.error?.reason || `Press failed for key ${key}`); + } + } + + /** + * Scroll the page in a direction. + */ + async scroll(direction: 'up' | 'down'): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + throw new Error('Page not available'); + } + + const delta = direction === 'down' ? 400 : -400; + await page.mouse.wheel(0, delta); + // Wait for scroll to take effect + await page.waitForTimeout(200); + } + + /** + * Get the current URL. + */ + async getCurrentUrl(): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + throw new Error('Page not available'); + } + + return page.url(); + } + + /** + * Get the viewport height. + */ + async getViewportHeight(): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + throw new Error('Page not available'); + } + + const viewport = page.viewportSize(); + return viewport?.height || 800; + } + + /** + * Scroll by a delta amount. + * + * @returns true if scroll was successful + */ + async scrollBy(dy: number): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + return false; + } + + try { + const beforeY = await page.evaluate(() => window.scrollY); + await page.mouse.wheel(0, dy); + // Wait for scroll to take effect + await page.waitForTimeout(100); + const afterY = await page.evaluate(() => window.scrollY); + return Math.abs(afterY - beforeY) > 10; + } catch { + return false; + } + } + + // --------------------------------------------------------------------------- + // Additional Utility Methods + // --------------------------------------------------------------------------- + + /** + * Wait for navigation to complete. + */ + async waitForNavigation(options?: { timeout?: number }): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + throw new Error('Page not available'); + } + + await page.waitForLoadState('domcontentloaded', { + timeout: options?.timeout || this.options.timeout, + }); + } + + /** + * Wait for an element to appear. + */ + async waitForElement(selector: string, options?: { timeout?: number }): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + throw new Error('Page not available'); + } + + await page.waitForSelector(selector, { + timeout: options?.timeout || this.options.timeout, + }); + } + + /** + * Take a screenshot. + * + * @param path - Optional path to save screenshot + * @returns Screenshot as base64 string + */ + async screenshot(path?: string): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + throw new Error('Page not available'); + } + + const buffer = await page.screenshot({ + path, + type: 'jpeg', + quality: 80, + }); + return buffer.toString('base64'); + } + + /** + * Execute JavaScript in the page context. + */ + async evaluate(code: string): Promise { + this.ensureStarted(); + + const page = this.browser.getPage(); + if (!page) { + throw new Error('Page not available'); + } + + return page.evaluate(code); + } + + /** + * Create PlaywrightRuntime from an existing Playwright Page. + * + * Useful for integrating with existing Playwright test setups. + */ + static async fromPage( + page: Page, + options?: Partial + ): Promise { + const runtime = new PlaywrightRuntime(options); + // Use PredicateBrowser.fromPage to wrap existing page + (runtime as any).browser = PredicateBrowser.fromPage(page, options?.apiKey, options?.apiUrl); + runtime.started = true; + return runtime; + } +} + +/** + * Create and start a PlaywrightRuntime. + * + * Convenience function that creates and starts the runtime in one call. + * + * @example + * ```typescript + * const runtime = await createPlaywrightRuntime({ headless: false }); + * // ... use runtime + * await runtime.close(); + * ``` + */ +export async function createPlaywrightRuntime( + options?: PlaywrightRuntimeOptions +): Promise { + const runtime = new PlaywrightRuntime(options); + await runtime.start(); + return runtime; +} diff --git a/src/agents/planner-executor/predicates.ts b/src/agents/planner-executor/predicates.ts new file mode 100644 index 0000000..147a4ae --- /dev/null +++ b/src/agents/planner-executor/predicates.ts @@ -0,0 +1,253 @@ +/** + * Predicate System for Step Verification + * + * Predicates are used to verify step outcomes and enable pre-step verification + * (skipping steps if the desired state is already achieved). + * + * Supported predicates: + * - url_contains: Check if URL contains a substring + * - url_matches: Check if URL matches a regex pattern + * - exists: Check if element with text/selector exists + * - not_exists: Check if element does not exist + * - element_count: Check element count within range + * - any_of: Any of the sub-predicates passes + * - all_of: All sub-predicates pass + */ + +import type { Snapshot, SnapshotElement, PredicateSpec } from './plan-models'; + +// --------------------------------------------------------------------------- +// Predicate Interface +// --------------------------------------------------------------------------- + +/** + * A predicate that can be evaluated against a snapshot. + */ +export interface Predicate { + /** Predicate type name */ + readonly name: string; + /** Evaluate predicate against a snapshot */ + evaluate(snapshot: Snapshot): boolean; +} + +// --------------------------------------------------------------------------- +// URL Predicates +// --------------------------------------------------------------------------- + +/** + * Check if URL contains a substring. + */ +export function urlContains(substring: string): Predicate { + return { + name: 'url_contains', + evaluate(snapshot: Snapshot): boolean { + const url = snapshot.url || ''; + return url.toLowerCase().includes(substring.toLowerCase()); + }, + }; +} + +/** + * Check if URL matches a regex pattern. + */ +export function urlMatches(pattern: string): Predicate { + return { + name: 'url_matches', + evaluate(snapshot: Snapshot): boolean { + const url = snapshot.url || ''; + try { + const regex = new RegExp(pattern, 'i'); + return regex.test(url); + } catch { + // Invalid regex, fall back to substring match + return url.toLowerCase().includes(pattern.toLowerCase()); + } + }, + }; +} + +// --------------------------------------------------------------------------- +// Element Predicates +// --------------------------------------------------------------------------- + +/** + * Check if element matching selector/text exists. + */ +export function exists(selectorOrText: string): Predicate { + return { + name: 'exists', + evaluate(snapshot: Snapshot): boolean { + const elements = snapshot.elements || []; + return elements.some(el => elementMatches(el, selectorOrText)); + }, + }; +} + +/** + * Check if element matching selector/text does NOT exist. + */ +export function notExists(selectorOrText: string): Predicate { + return { + name: 'not_exists', + evaluate(snapshot: Snapshot): boolean { + const elements = snapshot.elements || []; + return !elements.some(el => elementMatches(el, selectorOrText)); + }, + }; +} + +/** + * Check element count is within range. + */ +export function elementCount( + selectorOrText: string, + minCount: number = 0, + maxCount?: number +): Predicate { + return { + name: 'element_count', + evaluate(snapshot: Snapshot): boolean { + const elements = snapshot.elements || []; + const matching = elements.filter(el => elementMatches(el, selectorOrText)); + const count = matching.length; + if (count < minCount) return false; + if (maxCount !== undefined && count > maxCount) return false; + return true; + }, + }; +} + +/** + * Helper to check if element matches selector/text. + */ +function elementMatches(element: SnapshotElement, selectorOrText: string): boolean { + const text = (element.text || '').toLowerCase(); + const role = (element.role || '').toLowerCase(); + const ariaLabel = (element.ariaLabel || '').toLowerCase(); + const query = selectorOrText.toLowerCase(); + + // Direct text match + if (text.includes(query)) return true; + + // Aria label match + if (ariaLabel.includes(query)) return true; + + // Role-based selector (e.g., "button", "link") + if (role === query) return true; + + // Combined role:text selector (e.g., "button:submit") + if (selectorOrText.includes(':')) { + const [roleQuery, textQuery] = selectorOrText.split(':', 2); + if (role === roleQuery.toLowerCase() && text.includes(textQuery.toLowerCase())) { + return true; + } + } + + return false; +} + +// --------------------------------------------------------------------------- +// Composite Predicates +// --------------------------------------------------------------------------- + +/** + * Any of the sub-predicates passes (OR). + */ +export function anyOf(...predicates: Predicate[]): Predicate { + return { + name: 'any_of', + evaluate(snapshot: Snapshot): boolean { + return predicates.some(p => p.evaluate(snapshot)); + }, + }; +} + +/** + * All sub-predicates pass (AND). + */ +export function allOf(...predicates: Predicate[]): Predicate { + return { + name: 'all_of', + evaluate(snapshot: Snapshot): boolean { + return predicates.every(p => p.evaluate(snapshot)); + }, + }; +} + +// --------------------------------------------------------------------------- +// Predicate Builder +// --------------------------------------------------------------------------- + +/** + * Build a Predicate from a specification. + * + * @param spec - Predicate specification (from plan step verify array) + * @returns Predicate instance + * + * @example + * ```typescript + * const pred = buildPredicate({ predicate: 'url_contains', args: ['/cart'] }); + * const passes = pred.evaluate(snapshot); + * ``` + */ +export function buildPredicate(spec: PredicateSpec): Predicate { + const { predicate: name, args } = spec; + + switch (name) { + case 'url_contains': + return urlContains(String(args[0] || '')); + + case 'url_matches': + return urlMatches(String(args[0] || '')); + + case 'exists': + return exists(String(args[0] || '')); + + case 'not_exists': + return notExists(String(args[0] || '')); + + case 'element_count': { + const selector = String(args[0] || ''); + const minCount = typeof args[1] === 'number' ? args[1] : 0; + const maxCount = typeof args[2] === 'number' ? args[2] : undefined; + return elementCount(selector, minCount, maxCount); + } + + case 'any_of': + return anyOf(...(args as PredicateSpec[]).map(buildPredicate)); + + case 'all_of': + return allOf(...(args as PredicateSpec[]).map(buildPredicate)); + + default: + // Unknown predicate - always passes (lenient) + return { + name: `unknown:${name}`, + evaluate(): boolean { + return true; + }, + }; + } +} + +/** + * Evaluate all predicates against a snapshot. + * + * @param predicates - Array of predicate specifications + * @param snapshot - Snapshot to evaluate against + * @returns true if all predicates pass + */ +export function evaluatePredicates(predicates: PredicateSpec[], snapshot: Snapshot): boolean { + for (const spec of predicates) { + try { + const pred = buildPredicate(spec); + if (!pred.evaluate(snapshot)) { + return false; + } + } catch { + // On error, assume predicate fails + return false; + } + } + return true; +} diff --git a/src/agents/planner-executor/prompts.ts b/src/agents/planner-executor/prompts.ts new file mode 100644 index 0000000..f67862b --- /dev/null +++ b/src/agents/planner-executor/prompts.ts @@ -0,0 +1,304 @@ +/** + * Prompt Builders for PlannerExecutorAgent + * + * System and user prompts for Planner and Executor LLMs. + * Optimized for small local models (4B-8B parameters). + */ + +import type { ActionRecord } from './plan-models'; + +// --------------------------------------------------------------------------- +// Stepwise Planner Prompt (ReAct-style) +// --------------------------------------------------------------------------- + +/** + * Build system and user prompts for stepwise (ReAct-style) planning. + * + * Instead of generating a full plan upfront, this prompt asks the LLM to + * decide the next single action based on current page state and history. + * + * @param goal - The overall task goal + * @param currentUrl - Current page URL + * @param pageContext - Compact representation of page elements + * @param actionHistory - List of previously executed actions + * @returns Tuple of [systemPrompt, userPrompt] + */ +export function buildStepwisePlannerPrompt( + goal: string, + currentUrl: string, + pageContext: string, + actionHistory: ActionRecord[] +): [string, string] { + // Build action history text + let historyText = ''; + if (actionHistory.length > 0) { + historyText = 'Actions taken so far:\n'; + for (const rec of actionHistory) { + const targetStr = rec.target ? `(${rec.target})` : ''; + historyText += ` ${rec.stepNum}. ${rec.action}${targetStr} → ${rec.result}`; + if (rec.urlAfter) { + historyText += ` [URL: ${rec.urlAfter.slice(0, 60)}...]`; + } + historyText += '\n'; + } + historyText += '\n'; + } + + // Tight prompt optimized for small local models (7B) + // NOTE: /no_think at END of system prompt for Qwen3 compatibility + const system = `You are a browser automation planner. Decide the NEXT action. + +Actions: +- CLICK: Click an element. Set "intent" to element type/role. Set "input" to EXACT text from elements list. +- TYPE_AND_SUBMIT: Type text into a search box and submit. Set "input" to the SEARCH QUERY from the goal (NOT the element label). +- SCROLL: Scroll page. Set "direction" to "up" or "down". +- DONE: ONLY return DONE when the ENTIRE goal is complete. NOT after just one step. + +WHEN TO USE DONE: +- "Add to Cart" task: DONE only AFTER clicking the Add to Cart button +- "Search and click product" task: DONE only AFTER clicking a product link +- "Search only" task: DONE after search results appear +- If goal has multiple steps, complete ALL steps before returning DONE + +CRITICAL RULE FOR TYPE_AND_SUBMIT: +- "input" must be the SEARCH QUERY you want to type (e.g., "wireless headphones") +- "input" is NOT the element label (e.g., NOT "Search Amazon") +- ONLY use if you see a "searchbox" or "textbox" element + +CRITICAL RULE FOR CLICK (after search): +- After searching, you are on a RESULTS PAGE. Click a PRODUCT LINK to go to product details. +- Look for LINK elements with product names, prices, or /dp/ URLs +- Set "input" to the product title text from the elements list + +CRITICAL RULE FOR ADD TO CART: +- On product page, look for "Add to Cart" or "Add to Bag" buttons +- Set "input" to "Add to Cart" (or exact button text from elements) + +Output ONLY valid JSON (no markdown, no \`\`\`): +{"action":"TYPE_AND_SUBMIT","intent":"searchbox","input":"wireless headphones","reasoning":"search for product"} +{"action":"CLICK","intent":"product link","input":"Sony WH-1000XM4 Wireless...","reasoning":"click first product result"} +{"action":"CLICK","intent":"add to cart button","input":"Add to Cart","reasoning":"add item to cart"} +{"action":"DONE","intent":"completed","reasoning":"clicked add to cart - goal complete"} + +RULES: +1. For TYPE_AND_SUBMIT: "input" = search query from goal (what you want to search for) +2. For CLICK: "input" = exact text from elements list +3. Do NOT type into "email" or "newsletter" fields +4. Do NOT repeat the same action twice +5. Output ONLY JSON - no tags, no markdown, no prose +6. Do NOT output or any reasoning +7. Do NOT return DONE until ALL parts of the goal are complete`; + + // NOTE: /no_think MUST be at the START of user message for Qwen3 models + const user = `/no_think +Goal: ${goal} + +Current URL: ${currentUrl} + +${historyText}Current page elements (ID|role|text|importance|clickable|...): +${pageContext} + +Based on the goal and current page state, what is the NEXT action to take?`; + + return [system, user]; +} + +// --------------------------------------------------------------------------- +// Executor Prompt +// --------------------------------------------------------------------------- + +/** + * Build system and user prompts for the Executor LLM. + * + * @param goal - Human-readable goal for this step + * @param intent - Intent hint for element selection (optional) + * @param compactContext - Compact representation of page elements + * @param inputText - For TYPE_AND_SUBMIT: text to type. For CLICK: target text to match (optional) + * @param category - Task category for category-specific hints (optional) + * @param actionType - Action type (CLICK, TYPE_AND_SUBMIT, etc.) + * @returns Tuple of [systemPrompt, userPrompt] + */ +export function buildExecutorPrompt( + goal: string, + intent: string | undefined, + compactContext: string, + inputText?: string, + category?: string, + actionType?: string +): [string, string] { + const intentLine = intent ? `Intent: ${intent}\n` : ''; + + // For CLICK actions, inputText is target to match (not text to type) + const isTypeAction = actionType === 'TYPE_AND_SUBMIT' || actionType === 'TYPE'; + let inputLine = ''; + if (isTypeAction && inputText) { + inputLine = `Text to type: "${inputText}"\n`; + } else if (inputText) { + inputLine = `Target to find: "${inputText}"\n`; + } + + // Get category-specific hints + const categoryHints = getCategoryExecutorHints(category); + const categoryLine = categoryHints ? `${categoryHints}\n` : ''; + + // Build system prompt based on action type + let system: string; + + if (isTypeAction && inputText) { + // TYPE action - find the INPUT element (textbox/combobox), not the submit button + system = `You are an executor for browser automation. +Task: Find the INPUT element (textbox, combobox, searchbox) to type into. +Return ONLY ONE line: TYPE(, "text") +IMPORTANT: Return the ID of the INPUT/TEXTBOX element, NOT the submit button. +CRITICAL - AVOID these fields (they are NOT search boxes): +- Fields with 'email', 'newsletter', 'subscribe', 'signup' in the text +- Fields labeled 'Your email address', 'Email', 'Enter your email' +- Fields in footer/newsletter sections +ONLY use fields explicitly labeled for SEARCH (placeholder='Search', aria='Search'). +If NO search field exists, return NONE instead of guessing. +If you output anything else, the action fails. +Do NOT output or any reasoning. +No prose, no markdown, no extra whitespace. +Example: TYPE(42, "hello world")`; + } else { + // CLICK action (most common) + const searchKeywords = ['search', 'magnify', 'magnifier', 'find']; + const productKeywords = ['product', 'item', 'result', 'listing']; + const addToCartKeywords = ['add to cart', 'add to bag', 'add to basket', 'buy now']; + + const isSearchAction = + (intent && searchKeywords.some(kw => intent.toLowerCase().includes(kw))) || + searchKeywords.some(kw => goal.toLowerCase().includes(kw)); + + const isProductAction = + (intent && productKeywords.some(kw => intent.toLowerCase().includes(kw))) || + productKeywords.some(kw => goal.toLowerCase().includes(kw)); + + const isAddToCartAction = + (intent && addToCartKeywords.some(kw => intent.toLowerCase().includes(kw))) || + addToCartKeywords.some(kw => goal.toLowerCase().includes(kw)); + + const isTextMatchingAction = intent && intent.toLowerCase().includes('matching'); + const hasTargetText = Boolean(inputText); + + if (isSearchAction) { + system = `You are an executor for browser automation. +Return ONLY a single-line CLICK(id) action. +If you output anything else, the action fails. +Do NOT output or any reasoning. +SEARCH ICON HINTS: Look for links/buttons with 'search' in text/href, or icon-only elements (text='0' or empty) with 'search' in href. +Output MUST match exactly: CLICK() with no spaces. +Example: CLICK(12)`; + } else if (isTextMatchingAction || hasTargetText) { + // When planner specifies target text, executor must match it + const targetText = inputText || ''; + system = `You are an executor for browser automation. +Return ONLY a single-line CLICK(id) action. +If you output anything else, the action fails. +Do NOT output or any reasoning. +CRITICAL: Find an element with text matching '${targetText}'. +- Look for: product titles, category names, link text, button labels +- Text must contain the target words (case-insensitive partial match is OK) +- If NO element contains the target text, return NONE instead of clicking something random +Output: CLICK() or NONE +Example: CLICK(42) or NONE`; + } else if (isProductAction) { + system = `You are an executor for browser automation. +Return ONLY a single-line CLICK(id) action. +If you output anything else, the action fails. +Do NOT output or any reasoning. +PRODUCT CLICK HINTS: +- Look for LINK elements (role=link) with product IDs in href (e.g., /7027762, /dp/B...) +- Prefer links with delivery info text like 'Delivery', 'Ships to Store', 'Get it...' +- These are inside product cards and will navigate to product detail pages +- AVOID buttons like 'Search', 'Shop', category buttons, or filter buttons +- AVOID image slider options (slider image 1, 2, etc.) +Output MUST match exactly: CLICK() with no spaces. +Example: CLICK(1268)`; + } else if (isAddToCartAction) { + system = `You are an executor for browser automation. +Return ONLY a single-line CLICK(id) action. +If you output anything else, the action fails. +Do NOT output or any reasoning. +ADD TO CART HINTS: +- FIRST: Look for buttons with text: 'Add to Cart', 'Add to Bag', 'Add to Basket', 'Buy Now' +- If found, click that button directly +- FALLBACK: If NO 'Add to Cart' button is visible, you are likely on a SEARCH RESULTS page + - In this case, click a PRODUCT LINK to go to the product details page first + - Look for LINK elements with product IDs in href (e.g., /7027762, /dp/B...) + - Prefer links with product names, prices, or delivery info +- AVOID: 'Search' buttons, category buttons, filter buttons, pagination +Output MUST match exactly: CLICK() with no spaces. +Example: CLICK(42)`; + } else { + system = `You are an executor for browser automation. +Return ONLY a single-line CLICK(id) action. +If you output anything else, the action fails. +Do NOT output or any reasoning. +No prose, no markdown, no extra whitespace. +Output MUST match exactly: CLICK() with no spaces. +Example: CLICK(12)`; + } + } + + // Build action instruction based on action type + let actionInstruction: string; + if (isTypeAction && inputText) { + actionInstruction = `Return TYPE(id, "${inputText}"):`; + } else if (inputText) { + actionInstruction = `Return CLICK(id) for element matching "${inputText}", or NONE if not found:`; + } else { + actionInstruction = 'Return CLICK(id):'; + } + + // NOTE: /no_think MUST be at the START of user message for Qwen3 models + const user = `/no_think +Goal: ${goal} +${intentLine}${categoryLine}${inputLine} +Elements: +${compactContext} + +${actionInstruction}`; + + return [system, user]; +} + +// --------------------------------------------------------------------------- +// Helper Functions +// --------------------------------------------------------------------------- + +/** + * Get category-specific hints for the executor. + */ +function getCategoryExecutorHints(category?: string): string { + if (!category) return ''; + + const categoryLower = category.toLowerCase(); + + const hints: Record = { + shopping: "Priority: 'Add to Cart', 'Buy Now', 'Add to Bag', product links, price elements.", + checkout: "Priority: 'Checkout', 'Proceed to Checkout', 'Place Order', payment fields.", + form_filling: 'Priority: input fields, textboxes, submit/send buttons, form labels.', + search: 'Priority: search box, search button, result links, filter controls.', + auth: 'Priority: username/email field, password field, sign in/login button.', + extraction: 'Priority: data elements, table cells, list items, content containers.', + navigation: 'Priority: navigation links, menu items, breadcrumbs.', + }; + + return hints[categoryLower] || ''; +} + +// --------------------------------------------------------------------------- +// Stepwise Planner Response Schema +// --------------------------------------------------------------------------- + +/** + * Expected response format from stepwise planner. + */ +export interface StepwisePlannerResponse { + action: 'CLICK' | 'TYPE_AND_SUBMIT' | 'SCROLL' | 'DONE'; + intent?: string; + input?: string; + direction?: 'up' | 'down'; + reasoning?: string; +} diff --git a/src/agents/planner-executor/recovery.ts b/src/agents/planner-executor/recovery.ts new file mode 100644 index 0000000..d2aedea --- /dev/null +++ b/src/agents/planner-executor/recovery.ts @@ -0,0 +1,263 @@ +/** + * Recovery: Checkpoint and rollback mechanisms for automation recovery. + * + * This module provides state tracking and recovery mechanisms for when + * automation gets off-track. Key concepts: + * + * - RecoveryCheckpoint: A snapshot of known-good state (URL, step, digest) + * - RecoveryState: Tracks checkpoints and manages recovery attempts + * + * Recovery flow: + * 1. After each successful step verification, record a checkpoint + * 2. If verification fails repeatedly, attempt recovery to last checkpoint + * 3. Navigate back to checkpoint URL and re-verify + * 4. If recovery succeeds, resume from checkpoint step + */ + +/** + * Configuration for recovery navigation. + */ +export interface RecoveryNavigationConfig { + /** Whether recovery is enabled (default: true) */ + enabled: boolean; + /** Maximum recovery attempts per run (default: 2) */ + maxRecoveryAttempts: number; + /** Whether to track successful URLs for recovery (default: true) */ + trackSuccessfulUrls: boolean; + /** Maximum checkpoints to retain (default: 10) */ + maxCheckpoints: number; +} + +/** + * Default recovery navigation configuration. + */ +export const DEFAULT_RECOVERY_CONFIG: RecoveryNavigationConfig = { + enabled: true, + maxRecoveryAttempts: 2, + trackSuccessfulUrls: true, + maxCheckpoints: 10, +}; + +/** + * Checkpoint for rollback recovery. + * + * Created after each successful step verification to enable rollback + * if subsequent steps fail. + */ +export interface RecoveryCheckpoint { + /** The URL at this checkpoint */ + url: string; + /** The step index that was just completed (0-indexed) */ + stepIndex: number; + /** Hash of the snapshot for state verification */ + snapshotDigest: string; + /** When the checkpoint was created */ + timestamp: Date; + /** Labels of predicates that passed at this checkpoint */ + predicatesPassed: string[]; +} + +/** + * Tracks recovery state for rollback mechanism. + * + * Checkpoints are created after each successful step verification. + * Recovery can be attempted when steps fail repeatedly. + * + * @example + * ```typescript + * const state = new RecoveryState({ maxRecoveryAttempts: 2 }); + * + * // After successful step + * state.recordCheckpoint({ + * url: 'https://shop.com/cart', + * stepIndex: 2, + * snapshotDigest: 'abc123', + * predicatesPassed: ['url_contains("/cart")'], + * }); + * + * // On repeated failure + * if (state.canRecover()) { + * const checkpoint = state.consumeRecoveryAttempt(); + * // Navigate to checkpoint.url and resume + * } + * ``` + */ +export class RecoveryState { + /** List of recorded checkpoints (most recent last) */ + private checkpoints: RecoveryCheckpoint[] = []; + + /** Number of recovery attempts consumed */ + private recoveryAttemptsUsed: number = 0; + + /** Maximum allowed recovery attempts */ + readonly maxRecoveryAttempts: number; + + /** The checkpoint being recovered to (if any) */ + currentRecoveryTarget: RecoveryCheckpoint | null = null; + + /** Maximum checkpoints to retain */ + readonly maxCheckpoints: number; + + constructor(config: Partial = {}) { + this.maxRecoveryAttempts = + config.maxRecoveryAttempts ?? DEFAULT_RECOVERY_CONFIG.maxRecoveryAttempts; + this.maxCheckpoints = config.maxCheckpoints ?? DEFAULT_RECOVERY_CONFIG.maxCheckpoints; + } + + /** + * Record a successful checkpoint. + * + * Called after step verification passes to enable future rollback. + * + * @param checkpoint - Checkpoint data (without timestamp) + * @returns The created RecoveryCheckpoint + */ + recordCheckpoint(data: Omit): RecoveryCheckpoint { + const checkpoint: RecoveryCheckpoint = { + ...data, + timestamp: new Date(), + predicatesPassed: data.predicatesPassed || [], + }; + + this.checkpoints.push(checkpoint); + + // Keep only last N checkpoints to bound memory + if (this.checkpoints.length > this.maxCheckpoints) { + this.checkpoints = this.checkpoints.slice(-this.maxCheckpoints); + } + + return checkpoint; + } + + /** + * Get the most recent checkpoint for recovery. + * + * @returns Most recent RecoveryCheckpoint, or null if no checkpoints exist + */ + getRecoveryTarget(): RecoveryCheckpoint | null { + if (this.checkpoints.length === 0) { + return null; + } + return this.checkpoints[this.checkpoints.length - 1]; + } + + /** + * Get checkpoint at a specific step index. + * + * @param stepIndex - The step index to find + * @returns RecoveryCheckpoint at that step, or null if not found + */ + getCheckpointAtStep(stepIndex: number): RecoveryCheckpoint | null { + for (let i = this.checkpoints.length - 1; i >= 0; i--) { + if (this.checkpoints[i].stepIndex === stepIndex) { + return this.checkpoints[i]; + } + } + return null; + } + + /** + * Get the most recent checkpoint before a given step. + * + * @param stepIndex - The step index to find checkpoint before + * @returns Most recent checkpoint with stepIndex < given index, or null + */ + getCheckpointBeforeStep(stepIndex: number): RecoveryCheckpoint | null { + for (let i = this.checkpoints.length - 1; i >= 0; i--) { + if (this.checkpoints[i].stepIndex < stepIndex) { + return this.checkpoints[i]; + } + } + return null; + } + + /** + * Check if recovery is still possible. + * + * @returns True if recovery attempts remain and checkpoints exist + */ + canRecover(): boolean { + return this.recoveryAttemptsUsed < this.maxRecoveryAttempts && this.checkpoints.length > 0; + } + + /** + * Consume a recovery attempt and return target checkpoint. + * + * Increments recoveryAttemptsUsed and sets currentRecoveryTarget. + * + * @returns The checkpoint to recover to, or null if recovery not possible + */ + consumeRecoveryAttempt(): RecoveryCheckpoint | null { + if (!this.canRecover()) { + return null; + } + + this.recoveryAttemptsUsed++; + this.currentRecoveryTarget = this.getRecoveryTarget(); + return this.currentRecoveryTarget; + } + + /** + * Clear the current recovery target after recovery completes. + */ + clearRecoveryTarget(): void { + this.currentRecoveryTarget = null; + } + + /** + * Reset recovery state for a new run. + */ + reset(): void { + this.checkpoints = []; + this.recoveryAttemptsUsed = 0; + this.currentRecoveryTarget = null; + } + + /** + * Remove and return the most recent checkpoint. + * + * Useful when recovery fails and we want to try an earlier checkpoint. + * + * @returns The removed checkpoint, or null if no checkpoints exist + */ + popCheckpoint(): RecoveryCheckpoint | null { + if (this.checkpoints.length === 0) { + return null; + } + return this.checkpoints.pop() || null; + } + + /** + * Get the URL of the most recent successful checkpoint. + */ + get lastSuccessfulUrl(): string | null { + if (this.checkpoints.length === 0) { + return null; + } + return this.checkpoints[this.checkpoints.length - 1].url; + } + + /** + * Get the step index of the most recent successful checkpoint. + */ + get lastSuccessfulStep(): number | null { + if (this.checkpoints.length === 0) { + return null; + } + return this.checkpoints[this.checkpoints.length - 1].stepIndex; + } + + /** + * Get the number of checkpoints. + */ + get length(): number { + return this.checkpoints.length; + } + + /** + * Get the number of recovery attempts used. + */ + get attemptsUsed(): number { + return this.recoveryAttemptsUsed; + } +} diff --git a/src/agents/planner-executor/vision-fallback.ts b/src/agents/planner-executor/vision-fallback.ts new file mode 100644 index 0000000..3d280e0 --- /dev/null +++ b/src/agents/planner-executor/vision-fallback.ts @@ -0,0 +1,122 @@ +/** + * Vision Fallback Detection + * + * Detects when a snapshot is unusable and should trigger vision-based + * element detection instead of text-based snapshot analysis. + * + * Vision fallback is triggered when: + * - Snapshot has too few elements (< 10 and status indicates issues) + * - Snapshot status is "require_vision" or "error" + * - Diagnostics indicate low confidence or canvas page + */ + +import type { Snapshot } from './plan-models'; + +/** + * Snapshot diagnostics interface (optional data from runtime). + */ +export interface SnapshotDiagnostics { + /** Confidence score 0-1 (low = unreliable) */ + confidence?: number; + /** Whether page contains canvas elements */ + hasCanvas?: boolean; + /** Whether vision is explicitly required */ + requiresVision?: boolean; +} + +/** + * Result of vision fallback detection. + */ +export interface VisionFallbackResult { + /** Whether vision fallback should be used */ + shouldUseVision: boolean; + /** Reason for vision fallback (if triggered) */ + reason: string | null; +} + +/** + * Detect if snapshot is unusable and should trigger vision fallback. + * + * Returns whether vision-based element detection should be used instead + * of the text-based snapshot analysis. + * + * Note: If we have sufficient elements (10+), we should NOT trigger vision + * fallback even if diagnostics suggest it. This handles cases where the + * API incorrectly flags normal HTML pages as requiring vision. + * + * @param snapshot - The snapshot to analyze + * @returns Vision fallback result with shouldUseVision flag and reason + * + * @example + * ```typescript + * const result = detectSnapshotFailure(snapshot); + * if (result.shouldUseVision) { + * console.log(`Vision fallback needed: ${result.reason}`); + * } + * ``` + */ +export function detectSnapshotFailure(snapshot: Snapshot | null): VisionFallbackResult { + // Null snapshot always requires vision + if (snapshot === null) { + return { shouldUseVision: true, reason: 'snapshot_null' }; + } + + const elements = snapshot.elements || []; + const elementCount = elements.length; + + // If we have sufficient elements, the snapshot is usable + // regardless of what diagnostics say + if (elementCount >= 10) { + return { shouldUseVision: false, reason: null }; + } + + // Check explicit status field (tri-state: success, error, require_vision) + const status = snapshot.status || 'success'; + + if (status === 'require_vision') { + return { shouldUseVision: true, reason: 'require_vision' }; + } + + if (status === 'error') { + return { shouldUseVision: true, reason: `snapshot_error` }; + } + + // Check diagnostics if available (from snapshot metadata) + const diag = (snapshot as unknown as { diagnostics?: SnapshotDiagnostics }).diagnostics; + if (diag) { + // Low confidence + if (diag.confidence !== undefined && diag.confidence < 0.3) { + return { shouldUseVision: true, reason: 'low_confidence' }; + } + + // Canvas page with few elements + if (diag.hasCanvas && elementCount < 5) { + return { shouldUseVision: true, reason: 'canvas_page' }; + } + + // Diagnostics explicitly require vision + if (diag.requiresVision && elementCount < 5) { + return { shouldUseVision: true, reason: 'diagnostics_requires_vision' }; + } + } + + // Very few elements usually indicates a problem + if (elementCount < 3) { + return { shouldUseVision: true, reason: 'too_few_elements' }; + } + + return { shouldUseVision: false, reason: null }; +} + +/** + * Check if vision fallback is needed for a snapshot context. + * + * Convenience wrapper that checks both snapshot success and vision requirement. + * + * @param snapshotSuccess - Whether snapshot capture succeeded + * @param requiresVision - Whether vision is already flagged as required + * @returns true if vision should be used + */ +export function shouldUseVision(snapshotSuccess: boolean, requiresVision: boolean): boolean { + return !snapshotSuccess || requiresVision; +} diff --git a/src/llm-provider.ts b/src/llm-provider.ts index 4fd7753..1f6c88f 100644 --- a/src/llm-provider.ts +++ b/src/llm-provider.ts @@ -169,7 +169,55 @@ export class LocalLLMProvider extends LLMProvider { const data = JSON.parse(text); const choice = data?.choices?.[0]; - const content = choice?.message?.content ?? ''; + const message = choice?.message; + + // Extract content - Ollama/Qwen3 sometimes puts output in 'reasoning' field + // when the model uses thinking mode + let content = message?.content ?? ''; + + // If content is empty but reasoning exists, try to extract the answer from reasoning + // The reasoning field often ends with the actual answer after the thinking + if (!content && message?.reasoning) { + const reasoningLen = message.reasoning.length; + // Debug: log the END of the reasoning field (where the answer usually is) + const reasoningEnd = message.reasoning.slice(-300); + console.log( + `[LocalLLMProvider DEBUG] Empty content, reasoning len=${reasoningLen}, last 300 chars:\n${reasoningEnd}` + ); + + // First, try to extract JSON objects (for planner responses) + // Look for the LAST complete JSON object in the reasoning + const jsonMatches = [ + ...message.reasoning.matchAll(/\{[^{}]*"action"\s*:\s*"[^"]+"\s*[^{}]*\}/g), + ]; + if (jsonMatches.length > 0) { + content = jsonMatches[jsonMatches.length - 1][0]; + console.log(`[LocalLLMProvider DEBUG] Extracted JSON from reasoning: ${content}`); + } else { + // Look for the LAST action pattern in reasoning (answer comes after thinking) + // Use matchAll to find all occurrences and take the last one + // Priority: TYPE > CLICK > DONE > NONE (more specific actions first) + const typeMatches = [...message.reasoning.matchAll(/TYPE\(\d+,\s*"[^"]*"\)/g)]; + if (typeMatches.length > 0) { + content = typeMatches[typeMatches.length - 1][0]; + console.log(`[LocalLLMProvider DEBUG] Extracted TYPE from reasoning: ${content}`); + } else { + const clickMatches = [...message.reasoning.matchAll(/CLICK\(\d+\)/g)]; + if (clickMatches.length > 0) { + content = clickMatches[clickMatches.length - 1][0]; + console.log(`[LocalLLMProvider DEBUG] Extracted CLICK from reasoning: ${content}`); + } else if (message.reasoning.includes('DONE')) { + // Only use DONE if it's near the end (within last 100 chars) + if (message.reasoning.slice(-100).includes('DONE')) { + content = 'DONE'; + console.log(`[LocalLLMProvider DEBUG] Extracted DONE from end of reasoning`); + } + } + // Don't extract NONE - if model is still reasoning, let it continue + } + } + } + const usage = data?.usage; return { @@ -299,9 +347,12 @@ export class LocalVisionLLMProvider extends LocalLLMProvider { export class OllamaProvider extends LocalLLMProvider { private _ollamaBaseUrl: string; private _ollamaModelName: string; + private _disableThinking: boolean; constructor( - options: { model: string; baseUrl?: string; timeoutMs?: number } = { model: 'qwen3:8b' } + options: { model: string; baseUrl?: string; timeoutMs?: number; disableThinking?: boolean } = { + model: 'qwen3:8b', + } ) { const baseUrl = options.baseUrl ?? 'http://localhost:11434'; // Ollama serves OpenAI-compatible API at /v1 @@ -313,6 +364,29 @@ export class OllamaProvider extends LocalLLMProvider { }); this._ollamaBaseUrl = baseUrl; this._ollamaModelName = options.model; + // For Qwen3 and similar models, disable thinking by default if model name contains "qwen" + this._disableThinking = options.disableThinking ?? options.model.toLowerCase().includes('qwen'); + } + + /** + * Override generate to add Ollama-specific options for Qwen3 thinking mode. + */ + async generate( + systemPrompt: string, + userPrompt: string, + options: Record = {} + ): Promise { + // For Qwen3 models, add think: false to disable reasoning output + // Ollama OpenAI-compatible API passes model options via 'options' field + const ollamaOptions = { ...options }; + if (this._disableThinking) { + // Merge with existing options if any + ollamaOptions.options = { + ...(ollamaOptions.options || {}), + think: false, + }; + } + return super.generate(systemPrompt, userPrompt, ollamaOptions); } /** diff --git a/tests/agents/planner-executor/boundary-detection.test.ts b/tests/agents/planner-executor/boundary-detection.test.ts new file mode 100644 index 0000000..957eb50 --- /dev/null +++ b/tests/agents/planner-executor/boundary-detection.test.ts @@ -0,0 +1,279 @@ +/** + * Tests for boundary detection (auth pages, checkout pages). + */ + +import { + detectAuthBoundary, + detectCheckoutPage, + isCheckoutElement, + DEFAULT_AUTH_BOUNDARY_CONFIG, + DEFAULT_CHECKOUT_CONFIG, + type AuthBoundaryConfig, + type CheckoutDetectionConfig, +} from '../../../src/agents/planner-executor/boundary-detection'; + +describe('boundary-detection', () => { + describe('DEFAULT_AUTH_BOUNDARY_CONFIG', () => { + it('should have expected default values', () => { + expect(DEFAULT_AUTH_BOUNDARY_CONFIG.enabled).toBe(true); + expect(DEFAULT_AUTH_BOUNDARY_CONFIG.stopOnAuth).toBe(true); + expect(DEFAULT_AUTH_BOUNDARY_CONFIG.urlPatterns).toContain('/signin'); + expect(DEFAULT_AUTH_BOUNDARY_CONFIG.urlPatterns).toContain('/login'); + expect(DEFAULT_AUTH_BOUNDARY_CONFIG.authSuccessMessage).toBe( + 'Reached authentication boundary (login required)' + ); + }); + }); + + describe('DEFAULT_CHECKOUT_CONFIG', () => { + it('should have expected default values', () => { + expect(DEFAULT_CHECKOUT_CONFIG.enabled).toBe(true); + expect(DEFAULT_CHECKOUT_CONFIG.cartUrlPatterns).toContain('/cart'); + expect(DEFAULT_CHECKOUT_CONFIG.checkoutUrlPatterns).toContain('/checkout'); + expect(DEFAULT_CHECKOUT_CONFIG.checkoutElementPatterns).toContain('proceed to checkout'); + }); + }); + + describe('detectAuthBoundary', () => { + it('should detect signin URL', () => { + const result = detectAuthBoundary('https://example.com/signin'); + expect(result.isAuthBoundary).toBe(true); + expect(result.matchedPattern).toBe('/signin'); + }); + + it('should detect login URL', () => { + const result = detectAuthBoundary('https://example.com/login'); + expect(result.isAuthBoundary).toBe(true); + expect(result.matchedPattern).toBe('/login'); + }); + + it('should detect sign-in URL with hyphen', () => { + const result = detectAuthBoundary('https://example.com/sign-in'); + expect(result.isAuthBoundary).toBe(true); + expect(result.matchedPattern).toBe('/sign-in'); + }); + + it('should detect Amazon signin URL', () => { + const result = detectAuthBoundary('https://amazon.com/ap/signin'); + expect(result.isAuthBoundary).toBe(true); + // Note: /signin matches before /ap/signin, which is fine - both are auth pages + expect(result.matchedPattern).toBeTruthy(); + }); + + it('should detect Amazon register URL', () => { + const result = detectAuthBoundary('https://amazon.com/ap/register'); + expect(result.isAuthBoundary).toBe(true); + expect(result.matchedPattern).toBe('/ap/register'); + }); + + it('should detect Amazon CAPTCHA/claim URL', () => { + const result = detectAuthBoundary('https://amazon.com/ax/claim'); + expect(result.isAuthBoundary).toBe(true); + expect(result.matchedPattern).toBe('/ax/claim'); + }); + + it('should detect auth URL', () => { + const result = detectAuthBoundary('https://example.com/auth/callback'); + expect(result.isAuthBoundary).toBe(true); + expect(result.matchedPattern).toBe('/auth'); + }); + + it('should not detect non-auth URL', () => { + const result = detectAuthBoundary('https://example.com/products'); + expect(result.isAuthBoundary).toBe(false); + expect(result.matchedPattern).toBeNull(); + }); + + it('should be case insensitive', () => { + const result = detectAuthBoundary('https://example.com/LOGIN'); + expect(result.isAuthBoundary).toBe(true); + }); + + it('should handle empty URL', () => { + const result = detectAuthBoundary(''); + expect(result.isAuthBoundary).toBe(false); + expect(result.matchedPattern).toBeNull(); + }); + + it('should respect disabled config', () => { + const config: AuthBoundaryConfig = { + ...DEFAULT_AUTH_BOUNDARY_CONFIG, + enabled: false, + }; + const result = detectAuthBoundary('https://example.com/signin', config); + expect(result.isAuthBoundary).toBe(false); + }); + + it('should use custom URL patterns', () => { + const config: AuthBoundaryConfig = { + ...DEFAULT_AUTH_BOUNDARY_CONFIG, + urlPatterns: ['/custom-auth'], + }; + const result = detectAuthBoundary('https://example.com/custom-auth', config); + expect(result.isAuthBoundary).toBe(true); + expect(result.matchedPattern).toBe('/custom-auth'); + }); + }); + + describe('detectCheckoutPage', () => { + it('should detect cart URL', () => { + const result = detectCheckoutPage('https://shop.com/cart'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCart).toBe(true); + expect(result.isCheckout).toBe(false); + expect(result.matchedPattern).toBe('/cart'); + }); + + it('should detect basket URL', () => { + const result = detectCheckoutPage('https://shop.com/basket'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCart).toBe(true); + expect(result.isCheckout).toBe(false); + }); + + it('should detect bag URL', () => { + const result = detectCheckoutPage('https://shop.com/bag'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCart).toBe(true); + }); + + it('should detect checkout URL', () => { + const result = detectCheckoutPage('https://shop.com/checkout'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCart).toBe(false); + expect(result.isCheckout).toBe(true); + expect(result.matchedPattern).toBe('/checkout'); + }); + + it('should detect payment URL', () => { + const result = detectCheckoutPage('https://shop.com/payment'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCheckout).toBe(true); + }); + + it('should detect order URL', () => { + const result = detectCheckoutPage('https://shop.com/order'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCheckout).toBe(true); + }); + + it('should detect Amazon cart URL', () => { + const result = detectCheckoutPage('https://amazon.com/gp/cart'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCart).toBe(true); + }); + + it('should detect Amazon checkout URL', () => { + const result = detectCheckoutPage('https://amazon.com/gp/checkout'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCheckout).toBe(true); + }); + + it('should not detect regular product URL', () => { + const result = detectCheckoutPage('https://shop.com/products/widget'); + expect(result.isCheckoutRelated).toBe(false); + expect(result.isCart).toBe(false); + expect(result.isCheckout).toBe(false); + expect(result.matchedPattern).toBeNull(); + }); + + it('should be case insensitive', () => { + const result = detectCheckoutPage('https://shop.com/CHECKOUT'); + expect(result.isCheckoutRelated).toBe(true); + expect(result.isCheckout).toBe(true); + }); + + it('should handle empty URL', () => { + const result = detectCheckoutPage(''); + expect(result.isCheckoutRelated).toBe(false); + }); + + it('should respect disabled config', () => { + const config: CheckoutDetectionConfig = { + ...DEFAULT_CHECKOUT_CONFIG, + enabled: false, + }; + const result = detectCheckoutPage('https://shop.com/checkout', config); + expect(result.isCheckoutRelated).toBe(false); + }); + + it('should prioritize cart over checkout patterns', () => { + // Cart patterns are checked first + const config: CheckoutDetectionConfig = { + ...DEFAULT_CHECKOUT_CONFIG, + cartUrlPatterns: ['/cart'], + checkoutUrlPatterns: ['/cart'], // Same pattern + }; + const result = detectCheckoutPage('https://shop.com/cart', config); + expect(result.isCart).toBe(true); + expect(result.isCheckout).toBe(false); + }); + }); + + describe('isCheckoutElement', () => { + it('should detect "proceed to checkout" text', () => { + expect(isCheckoutElement('Proceed to Checkout')).toBe(true); + }); + + it('should detect "go to checkout" text', () => { + expect(isCheckoutElement('Go to Checkout')).toBe(true); + }); + + it('should detect "view cart" text', () => { + expect(isCheckoutElement('View Cart')).toBe(true); + }); + + it('should detect "shopping cart" text', () => { + expect(isCheckoutElement('Shopping Cart (3 items)')).toBe(true); + }); + + it('should detect "your cart" text', () => { + expect(isCheckoutElement('Your Cart')).toBe(true); + }); + + it('should detect "sign in to checkout" text', () => { + expect(isCheckoutElement('Sign in to checkout')).toBe(true); + }); + + it('should detect "continue to payment" text', () => { + expect(isCheckoutElement('Continue to Payment')).toBe(true); + }); + + it('should detect "place your order" text', () => { + expect(isCheckoutElement('Place Your Order')).toBe(true); + }); + + it('should detect "buy now" text', () => { + expect(isCheckoutElement('Buy Now')).toBe(true); + }); + + it('should not match regular text', () => { + expect(isCheckoutElement('Add to Wishlist')).toBe(false); + }); + + it('should not match unrelated button text', () => { + expect(isCheckoutElement('Continue Shopping')).toBe(false); + }); + + it('should be case insensitive', () => { + expect(isCheckoutElement('PROCEED TO CHECKOUT')).toBe(true); + }); + + it('should handle empty text', () => { + expect(isCheckoutElement('')).toBe(false); + }); + + it('should handle null-like text', () => { + expect(isCheckoutElement(null as unknown as string)).toBe(false); + }); + + it('should use custom patterns', () => { + const config: CheckoutDetectionConfig = { + ...DEFAULT_CHECKOUT_CONFIG, + checkoutElementPatterns: ['custom checkout'], + }; + expect(isCheckoutElement('Custom Checkout', config)).toBe(true); + expect(isCheckoutElement('Proceed to Checkout', config)).toBe(false); + }); + }); +}); diff --git a/tests/agents/planner-executor/config.test.ts b/tests/agents/planner-executor/config.test.ts new file mode 100644 index 0000000..abd5d31 --- /dev/null +++ b/tests/agents/planner-executor/config.test.ts @@ -0,0 +1,225 @@ +/** + * Tests for PlannerExecutorAgent Configuration + */ + +import { + DEFAULT_CONFIG, + mergeConfig, + getConfigPreset, + ConfigPreset, + type SnapshotEscalationConfig, +} from '../../../src/agents/planner-executor/config'; + +describe('PlannerExecutorConfig', () => { + describe('DEFAULT_CONFIG', () => { + it('should have correct snapshot escalation defaults', () => { + const snapshot = DEFAULT_CONFIG.snapshot; + + expect(snapshot.enabled).toBe(true); + // Same defaults as Python SDK - formatContext uses multi-strategy selection + // to ensure product links are captured even with lower snapshot limits + expect(snapshot.limitBase).toBe(60); + expect(snapshot.limitStep).toBe(30); + expect(snapshot.limitMax).toBe(200); + }); + + it('should have correct scroll-after-escalation defaults', () => { + const snapshot = DEFAULT_CONFIG.snapshot; + + expect(snapshot.scrollAfterEscalation).toBe(true); + expect(snapshot.scrollMaxAttempts).toBe(3); + expect(snapshot.scrollDirections).toEqual(['down', 'up']); + expect(snapshot.scrollViewportFraction).toBe(0.4); + expect(snapshot.scrollStabilizeMs).toBe(300); + }); + + it('should have correct retry defaults', () => { + const retry = DEFAULT_CONFIG.retry; + + expect(retry.verifyTimeoutMs).toBe(10000); + expect(retry.verifyPollMs).toBe(500); + expect(retry.verifyMaxAttempts).toBe(4); + expect(retry.executorRepairAttempts).toBe(2); + expect(retry.maxReplans).toBe(2); + }); + + it('should have correct stepwise planning defaults', () => { + const stepwise = DEFAULT_CONFIG.stepwise; + + expect(stepwise.maxSteps).toBe(20); + expect(stepwise.actionHistoryLimit).toBe(5); + expect(stepwise.includePageContext).toBe(true); + }); + + it('should have correct LLM token defaults', () => { + expect(DEFAULT_CONFIG.plannerMaxTokens).toBe(2048); + expect(DEFAULT_CONFIG.plannerTemperature).toBe(0.0); + expect(DEFAULT_CONFIG.executorMaxTokens).toBe(96); + expect(DEFAULT_CONFIG.executorTemperature).toBe(0.0); + }); + + it('should have preStepVerification enabled by default', () => { + expect(DEFAULT_CONFIG.preStepVerification).toBe(true); + }); + }); + + describe('mergeConfig', () => { + it('should merge partial config with defaults', () => { + const partial = { + verbose: true, + plannerMaxTokens: 4096, + }; + + const merged = mergeConfig(partial); + + expect(merged.verbose).toBe(true); + expect(merged.plannerMaxTokens).toBe(4096); + // Other values should be defaults + expect(merged.executorMaxTokens).toBe(DEFAULT_CONFIG.executorMaxTokens); + expect(merged.snapshot.limitBase).toBe(DEFAULT_CONFIG.snapshot.limitBase); + }); + + it('should deep merge nested snapshot config', () => { + const partial = { + snapshot: { + limitBase: 100, + scrollMaxAttempts: 5, + }, + }; + + const merged = mergeConfig(partial); + + expect(merged.snapshot.limitBase).toBe(100); + expect(merged.snapshot.scrollMaxAttempts).toBe(5); + // Other snapshot values should be defaults + expect(merged.snapshot.limitStep).toBe(DEFAULT_CONFIG.snapshot.limitStep); + expect(merged.snapshot.scrollAfterEscalation).toBe( + DEFAULT_CONFIG.snapshot.scrollAfterEscalation + ); + }); + + it('should deep merge nested retry config', () => { + const partial = { + retry: { + verifyTimeoutMs: 20000, + }, + }; + + const merged = mergeConfig(partial); + + expect(merged.retry.verifyTimeoutMs).toBe(20000); + expect(merged.retry.verifyPollMs).toBe(DEFAULT_CONFIG.retry.verifyPollMs); + }); + + it('should handle empty partial config', () => { + const merged = mergeConfig({}); + + expect(merged).toEqual(DEFAULT_CONFIG); + }); + + it('should allow disabling scroll-after-escalation', () => { + const partial = { + snapshot: { + scrollAfterEscalation: false, + }, + }; + + const merged = mergeConfig(partial); + + expect(merged.snapshot.scrollAfterEscalation).toBe(false); + expect(merged.snapshot.limitBase).toBe(DEFAULT_CONFIG.snapshot.limitBase); + }); + }); + + describe('getConfigPreset', () => { + it('should return LOCAL_SMALL_MODEL preset with high token limits', () => { + const config = getConfigPreset(ConfigPreset.LOCAL_SMALL_MODEL); + + expect(config.plannerMaxTokens).toBe(8192); + expect(config.executorMaxTokens).toBe(4096); + expect(config.verbose).toBe(true); + // Should inherit scroll settings from DEFAULT_CONFIG + expect(config.snapshot.scrollAfterEscalation).toBe(true); + }); + + it('should return CLOUD_HIGH_QUALITY preset', () => { + const config = getConfigPreset(ConfigPreset.CLOUD_HIGH_QUALITY); + + expect(config.plannerMaxTokens).toBe(2048); + expect(config.executorMaxTokens).toBe(128); + expect(config.verbose).toBe(false); + }); + + it('should return FAST_ITERATION preset with minimal retries', () => { + const config = getConfigPreset(ConfigPreset.FAST_ITERATION); + + expect(config.retry.verifyMaxAttempts).toBe(2); + expect(config.retry.executorRepairAttempts).toBe(1); + expect(config.plannerMaxTokens).toBe(1024); + }); + + it('should return PRODUCTION preset with more retries', () => { + const config = getConfigPreset(ConfigPreset.PRODUCTION); + + expect(config.retry.verifyMaxAttempts).toBe(8); + expect(config.retry.executorRepairAttempts).toBe(3); + expect(config.retry.verifyTimeoutMs).toBe(20000); + }); + + it('should return DEFAULT preset', () => { + const config = getConfigPreset(ConfigPreset.DEFAULT); + + expect(config).toEqual(DEFAULT_CONFIG); + }); + + it('should handle string preset names', () => { + const config = getConfigPreset('local_small'); + + expect(config.plannerMaxTokens).toBe(8192); + }); + + it('should return default for unknown preset', () => { + const config = getConfigPreset('unknown_preset'); + + expect(config).toEqual(DEFAULT_CONFIG); + }); + }); + + describe('SnapshotEscalationConfig scroll parameters', () => { + it('should support custom scroll directions', () => { + const partial = { + snapshot: { + scrollDirections: ['up'] as Array<'up' | 'down'>, + }, + }; + + const merged = mergeConfig(partial); + + expect(merged.snapshot.scrollDirections).toEqual(['up']); + }); + + it('should support custom viewport fraction', () => { + const partial = { + snapshot: { + scrollViewportFraction: 0.5, + }, + }; + + const merged = mergeConfig(partial); + + expect(merged.snapshot.scrollViewportFraction).toBe(0.5); + }); + + it('should support custom stabilize delay', () => { + const partial = { + snapshot: { + scrollStabilizeMs: 500, + }, + }; + + const merged = mergeConfig(partial); + + expect(merged.snapshot.scrollStabilizeMs).toBe(500); + }); + }); +}); diff --git a/tests/agents/planner-executor/intent-heuristics.test.ts b/tests/agents/planner-executor/intent-heuristics.test.ts new file mode 100644 index 0000000..02c0443 --- /dev/null +++ b/tests/agents/planner-executor/intent-heuristics.test.ts @@ -0,0 +1,155 @@ +/** + * Tests for Intent Heuristics + */ + +import type { SnapshotElement } from '../../../src/agents/planner-executor/plan-models'; + +// We need to test the SimpleIntentHeuristics class, but it's not exported. +// So we'll test via the PlannerExecutorAgent which uses it internally. +// For now, let's test the pattern matching logic conceptually. + +describe('Intent Heuristics', () => { + // Helper to create mock elements + const createElement = ( + id: number, + text: string, + role: string = 'button', + clickable: boolean = true + ): SnapshotElement => ({ + id, + text, + role, + clickable, + }); + + describe('Common Intent Patterns', () => { + // These tests verify the patterns we expect to match + + it('should recognize add_to_cart patterns', () => { + const patterns = ['add to cart', 'add to bag', 'add to basket', 'buy now', 'add item']; + const elements = patterns.map((p, i) => createElement(i + 1, p)); + + // Each element text should match one of the patterns + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + expect(patterns.some(p => text.includes(p))).toBe(true); + } + }); + + it('should recognize checkout patterns', () => { + const patterns = ['checkout', 'proceed to checkout', 'go to checkout', 'check out']; + const elements = patterns.map((p, i) => createElement(i + 1, p)); + + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + expect(patterns.some(p => text.includes(p))).toBe(true); + } + }); + + it('should recognize search patterns', () => { + const patterns = ['search', 'find', 'go', 'submit']; + const elements = patterns.map((p, i) => createElement(i + 1, p)); + + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + expect(patterns.some(p => text.includes(p))).toBe(true); + } + }); + + it('should recognize login patterns', () => { + const patterns = ['log in', 'login', 'sign in', 'signin']; + const elements = patterns.map((p, i) => createElement(i + 1, p)); + + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + expect(patterns.some(p => text.includes(p))).toBe(true); + } + }); + + it('should recognize close/dismiss patterns', () => { + const patterns = ['close', 'dismiss', 'x', 'cancel', 'no thanks']; + const elements = patterns.map((p, i) => createElement(i + 1, p)); + + for (const element of elements) { + const text = (element.text || '').toLowerCase(); + expect(patterns.some(p => text.includes(p))).toBe(true); + } + }); + }); + + describe('Element Matching Priority', () => { + it('should prefer clickable buttons over non-clickable elements', () => { + const elements = [ + createElement(1, 'Add to Cart', 'text', false), + createElement(2, 'Add to Cart', 'button', true), + ]; + + // When matching, clickable buttons should be preferred + const clickableButtons = elements.filter(e => e.clickable && e.role === 'button'); + expect(clickableButtons.length).toBe(1); + expect(clickableButtons[0].id).toBe(2); + }); + + it('should match elements with matching aria-label', () => { + const element: SnapshotElement = { + id: 1, + text: '', + role: 'button', + ariaLabel: 'Add to shopping cart', + clickable: true, + }; + + const ariaLabel = (element.ariaLabel || '').toLowerCase(); + expect(ariaLabel.includes('cart')).toBe(true); + }); + }); + + describe('Intent Normalization', () => { + it('should handle underscore-separated intents', () => { + const intent = 'add_to_cart'; + const normalized = intent.toLowerCase().replace(/[_-]/g, ' '); + expect(normalized).toBe('add to cart'); + }); + + it('should handle hyphen-separated intents', () => { + const intent = 'add-to-cart'; + const normalized = intent.toLowerCase().replace(/[_-]/g, ' '); + expect(normalized).toBe('add to cart'); + }); + + it('should handle mixed case intents', () => { + const intent = 'Add_To_Cart'; + const normalized = intent.toLowerCase().replace(/[_-]/g, ' '); + expect(normalized).toBe('add to cart'); + }); + }); + + describe('Edge Cases', () => { + it('should handle empty elements array', () => { + const elements: SnapshotElement[] = []; + expect(elements.length).toBe(0); + }); + + it('should handle elements with undefined text', () => { + const element: SnapshotElement = { + id: 1, + role: 'button', + text: undefined, + }; + + const text = (element.text || '').toLowerCase(); + expect(text).toBe(''); + }); + + it('should handle elements with empty text', () => { + const element: SnapshotElement = { + id: 1, + role: 'button', + text: '', + }; + + const text = (element.text || '').toLowerCase(); + expect(text).toBe(''); + }); + }); +}); diff --git a/tests/agents/planner-executor/modal-dismissal.test.ts b/tests/agents/planner-executor/modal-dismissal.test.ts new file mode 100644 index 0000000..2ef1c9b --- /dev/null +++ b/tests/agents/planner-executor/modal-dismissal.test.ts @@ -0,0 +1,347 @@ +/** + * Tests for modal/overlay dismissal logic. + */ + +import { + findDismissalTarget, + detectModalAppearance, + detectModalDismissed, + DEFAULT_MODAL_CONFIG, + type ModalDismissalConfig, +} from '../../../src/agents/planner-executor/modal-dismissal'; +import type { SnapshotElement } from '../../../src/agents/planner-executor/plan-models'; + +describe('modal-dismissal', () => { + describe('DEFAULT_MODAL_CONFIG', () => { + it('should have expected default values', () => { + expect(DEFAULT_MODAL_CONFIG.enabled).toBe(true); + expect(DEFAULT_MODAL_CONFIG.maxAttempts).toBe(2); + expect(DEFAULT_MODAL_CONFIG.minNewElements).toBe(5); + expect(DEFAULT_MODAL_CONFIG.roleFilter).toContain('button'); + expect(DEFAULT_MODAL_CONFIG.roleFilter).toContain('link'); + expect(DEFAULT_MODAL_CONFIG.dismissPatterns).toContain('no thanks'); + expect(DEFAULT_MODAL_CONFIG.dismissPatterns).toContain('close'); + expect(DEFAULT_MODAL_CONFIG.iconPatterns).toContain('x'); + expect(DEFAULT_MODAL_CONFIG.iconPatterns).toContain('×'); + expect(DEFAULT_MODAL_CONFIG.checkoutPatterns).toContain('checkout'); + }); + }); + + describe('findDismissalTarget', () => { + const createButton = (id: number, text: string, role = 'button'): SnapshotElement => ({ + id, + role, + text, + }); + + it('should find "No thanks" button', () => { + const elements: SnapshotElement[] = [ + createButton(1, 'Add Protection'), + createButton(2, 'No thanks'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('no thanks'); + expect(result.hasCheckoutButton).toBe(false); + }); + + it('should find "Close" button', () => { + const elements: SnapshotElement[] = [createButton(1, 'Subscribe'), createButton(2, 'Close')]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('close'); + }); + + it('should find "X" icon button (exact match)', () => { + const elements: SnapshotElement[] = [createButton(1, 'Accept'), createButton(2, 'x')]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('x'); + }); + + it('should find multiplication sign icon', () => { + const elements: SnapshotElement[] = [createButton(1, 'Subscribe'), createButton(2, '×')]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('×'); + }); + + it('should not match "x" within words like "mexico"', () => { + const elements: SnapshotElement[] = [ + createButton(1, 'Ship to Mexico'), + createButton(2, 'Continue'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('continue'); + }); + + it('should not match "close" within "enclosed"', () => { + const elements: SnapshotElement[] = [ + createButton(1, 'Enclosed package'), + createButton(2, 'Dismiss'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.matchedPattern).toBe('dismiss'); + }); + + it('should prioritize icon patterns over text patterns', () => { + const elements: SnapshotElement[] = [createButton(1, 'No thanks'), createButton(2, 'x')]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('x'); + }); + + it('should prioritize earlier dismiss patterns', () => { + // "no thanks" comes before "close" in the pattern list + const elements: SnapshotElement[] = [createButton(1, 'Close'), createButton(2, 'No thanks')]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('no thanks'); + }); + + it('should skip global nav cart links', () => { + const elements: SnapshotElement[] = [ + { id: 1, role: 'link', text: 'Cart', href: '/cart' }, + createButton(2, 'Close'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + }); + + it('should skip cart count indicators', () => { + const elements: SnapshotElement[] = [ + { id: 1, role: 'button', text: '3' }, + createButton(2, 'Dismiss'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + }); + + it('should not dismiss when checkout button found', () => { + const elements: SnapshotElement[] = [ + createButton(1, 'Proceed to Checkout'), + createButton(2, 'Close'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(false); + expect(result.elementId).toBeNull(); + expect(result.hasCheckoutButton).toBe(true); + }); + + it('should not dismiss when view cart button found', () => { + const elements: SnapshotElement[] = [ + createButton(1, 'View Cart'), + createButton(2, 'No thanks'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(false); + expect(result.hasCheckoutButton).toBe(true); + }); + + it('should not dismiss when cart/checkout link found', () => { + const elements: SnapshotElement[] = [ + { id: 1, role: 'link', text: 'Review Order', href: '/checkout/review' }, + createButton(2, 'Close'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(false); + expect(result.hasCheckoutButton).toBe(true); + }); + + it('should only consider buttons and links', () => { + const elements: SnapshotElement[] = [ + { id: 1, role: 'text', text: 'Close' }, + { id: 2, role: 'heading', text: 'No thanks' }, + createButton(3, 'OK'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(3); + }); + + it('should use aria-label for matching', () => { + const elements: SnapshotElement[] = [ + { id: 1, role: 'button', text: '', ariaLabel: 'Close dialog' }, + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(1); + expect(result.matchedPattern).toBe('close'); + }); + + it('should return not found for empty elements', () => { + const result = findDismissalTarget([]); + expect(result.found).toBe(false); + expect(result.elementId).toBeNull(); + expect(result.hasCheckoutButton).toBe(false); + }); + + it('should respect disabled config', () => { + const config: ModalDismissalConfig = { + ...DEFAULT_MODAL_CONFIG, + enabled: false, + }; + const elements: SnapshotElement[] = [createButton(1, 'No thanks')]; + + const result = findDismissalTarget(elements, config); + expect(result.found).toBe(false); + }); + + it('should use custom dismiss patterns', () => { + const config: ModalDismissalConfig = { + ...DEFAULT_MODAL_CONFIG, + dismissPatterns: ['custom dismiss'], + }; + const elements: SnapshotElement[] = [ + createButton(1, 'No thanks'), + createButton(2, 'Custom Dismiss'), + ]; + + const result = findDismissalTarget(elements, config); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + expect(result.matchedPattern).toBe('custom dismiss'); + }); + + it('should skip elements without id', () => { + const elements: SnapshotElement[] = [ + { role: 'button', text: 'No thanks' } as SnapshotElement, + createButton(2, 'Close'), + ]; + + const result = findDismissalTarget(elements); + expect(result.found).toBe(true); + expect(result.elementId).toBe(2); + }); + }); + + describe('detectModalAppearance', () => { + it('should detect modal appearance with many new elements', () => { + const preElements = new Set([1, 2, 3]); + const postElements = new Set([1, 2, 3, 4, 5, 6, 7, 8]); + + expect(detectModalAppearance(preElements, postElements)).toBe(true); + }); + + it('should not detect modal with few new elements', () => { + const preElements = new Set([1, 2, 3]); + const postElements = new Set([1, 2, 3, 4, 5]); + + expect(detectModalAppearance(preElements, postElements)).toBe(false); + }); + + it('should detect modal with exactly minNewElements', () => { + const preElements = new Set([1, 2]); + const postElements = new Set([1, 2, 3, 4, 5, 6, 7]); + + expect(detectModalAppearance(preElements, postElements, 5)).toBe(true); + }); + + it('should not detect modal when elements removed', () => { + const preElements = new Set([1, 2, 3, 4, 5, 6, 7, 8]); + const postElements = new Set([1, 2]); + + expect(detectModalAppearance(preElements, postElements)).toBe(false); + }); + + it('should handle empty pre-elements', () => { + const preElements = new Set(); + const postElements = new Set([1, 2, 3, 4, 5, 6]); + + expect(detectModalAppearance(preElements, postElements)).toBe(true); + }); + + it('should handle empty post-elements', () => { + const preElements = new Set([1, 2, 3]); + const postElements = new Set(); + + expect(detectModalAppearance(preElements, postElements)).toBe(false); + }); + + it('should use custom minNewElements', () => { + const preElements = new Set([1, 2]); + const postElements = new Set([1, 2, 3, 4, 5]); + + expect(detectModalAppearance(preElements, postElements, 3)).toBe(true); + expect(detectModalAppearance(preElements, postElements, 5)).toBe(false); + }); + }); + + describe('detectModalDismissed', () => { + it('should detect modal dismissed with many removed elements', () => { + const preElements = new Set([1, 2, 3, 4, 5, 6, 7, 8]); + const postElements = new Set([1, 2, 3]); + + expect(detectModalDismissed(preElements, postElements)).toBe(true); + }); + + it('should not detect dismissal with few removed elements', () => { + const preElements = new Set([1, 2, 3, 4, 5]); + const postElements = new Set([1, 2, 3, 4]); + + expect(detectModalDismissed(preElements, postElements)).toBe(false); + }); + + it('should detect dismissal with exactly minRemovedElements', () => { + const preElements = new Set([1, 2, 3, 4, 5]); + const postElements = new Set([1, 2]); + + expect(detectModalDismissed(preElements, postElements, 3)).toBe(true); + }); + + it('should not detect dismissal when elements added', () => { + const preElements = new Set([1, 2]); + const postElements = new Set([1, 2, 3, 4, 5, 6, 7, 8]); + + expect(detectModalDismissed(preElements, postElements)).toBe(false); + }); + + it('should handle empty pre-elements', () => { + const preElements = new Set(); + const postElements = new Set([1, 2, 3]); + + expect(detectModalDismissed(preElements, postElements)).toBe(false); + }); + + it('should handle empty post-elements', () => { + const preElements = new Set([1, 2, 3, 4, 5]); + const postElements = new Set(); + + expect(detectModalDismissed(preElements, postElements)).toBe(true); + }); + + it('should use custom minRemovedElements', () => { + const preElements = new Set([1, 2, 3, 4, 5]); + const postElements = new Set([1, 2, 3]); + + expect(detectModalDismissed(preElements, postElements, 2)).toBe(true); + expect(detectModalDismissed(preElements, postElements, 5)).toBe(false); + }); + }); +}); diff --git a/tests/agents/planner-executor/predicates.test.ts b/tests/agents/planner-executor/predicates.test.ts new file mode 100644 index 0000000..e9cbb69 --- /dev/null +++ b/tests/agents/planner-executor/predicates.test.ts @@ -0,0 +1,359 @@ +/** + * Tests for Predicate System + */ + +import { + urlContains, + urlMatches, + exists, + notExists, + elementCount, + anyOf, + allOf, + buildPredicate, + evaluatePredicates, + type Predicate, +} from '../../../src/agents/planner-executor/predicates'; +import type { Snapshot, SnapshotElement } from '../../../src/agents/planner-executor/plan-models'; + +describe('Predicates', () => { + // Helper to create a mock snapshot + const createSnapshot = (url: string, elements: Partial[] = []): Snapshot => ({ + url, + title: 'Test Page', + elements: elements.map((el, i) => ({ + id: i + 1, + role: el.role || 'button', + text: el.text || '', + ariaLabel: el.ariaLabel, + ...el, + })) as SnapshotElement[], + }); + + describe('urlContains', () => { + it('should return true when URL contains substring', () => { + const pred = urlContains('/cart'); + const snapshot = createSnapshot('https://example.com/cart'); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should return false when URL does not contain substring', () => { + const pred = urlContains('/checkout'); + const snapshot = createSnapshot('https://example.com/cart'); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + + it('should be case-insensitive', () => { + const pred = urlContains('/CART'); + const snapshot = createSnapshot('https://example.com/cart'); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should handle empty URL', () => { + const pred = urlContains('/cart'); + const snapshot = createSnapshot(''); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + }); + + describe('urlMatches', () => { + it('should match regex pattern', () => { + const pred = urlMatches('/dp/[A-Z0-9]+'); + const snapshot = createSnapshot('https://amazon.com/dp/B08N5WRWNW'); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should return false for non-matching URL', () => { + const pred = urlMatches('/dp/[A-Z0-9]+'); + const snapshot = createSnapshot('https://amazon.com/s?k=laptop'); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + + it('should fall back to substring match for invalid regex', () => { + const pred = urlMatches('[invalid(regex'); + const snapshot = createSnapshot('https://example.com/[invalid(regex'); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + }); + + describe('exists', () => { + it('should return true when element with text exists', () => { + const pred = exists('Add to Cart'); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Add to Cart', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should return false when element does not exist', () => { + const pred = exists('Checkout'); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Add to Cart', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + + it('should match partial text', () => { + const pred = exists('cart'); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Add to Cart', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should match aria-label', () => { + const pred = exists('search'); + const snapshot = createSnapshot('https://example.com', [ + { text: '', ariaLabel: 'Search products', role: 'textbox' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should match by role', () => { + const pred = exists('button'); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Click me', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + }); + + describe('notExists', () => { + it('should return true when element does not exist', () => { + const pred = notExists('Error message'); + const snapshot = createSnapshot('https://example.com', [{ text: 'Success', role: 'alert' }]); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should return false when element exists', () => { + const pred = notExists('Error'); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Error: Something went wrong', role: 'alert' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + }); + + describe('elementCount', () => { + it('should return true when count is within range', () => { + const pred = elementCount('button', 2, 5); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Button 1', role: 'button' }, + { text: 'Button 2', role: 'button' }, + { text: 'Button 3', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should return false when count is below minimum', () => { + const pred = elementCount('button', 5); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Button 1', role: 'button' }, + { text: 'Button 2', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + + it('should return false when count exceeds maximum', () => { + const pred = elementCount('button', 0, 2); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Button 1', role: 'button' }, + { text: 'Button 2', role: 'button' }, + { text: 'Button 3', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + + it('should work with only minimum specified', () => { + const pred = elementCount('button', 1); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Button 1', role: 'button' }, + { text: 'Button 2', role: 'button' }, + ]); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + }); + + describe('anyOf', () => { + it('should return true when any predicate passes', () => { + const pred = anyOf(urlContains('/cart'), urlContains('/checkout')); + const snapshot = createSnapshot('https://example.com/checkout'); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should return false when no predicate passes', () => { + const pred = anyOf(urlContains('/cart'), urlContains('/checkout')); + const snapshot = createSnapshot('https://example.com/home'); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + + it('should short-circuit on first true', () => { + const pred = anyOf(urlContains('/cart'), urlContains('/cart')); + const snapshot = createSnapshot('https://example.com/cart'); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + }); + + describe('allOf', () => { + it('should return true when all predicates pass', () => { + const pred = allOf(urlContains('amazon'), urlContains('/dp/')); + const snapshot = createSnapshot('https://amazon.com/dp/B123456'); + + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should return false when any predicate fails', () => { + const pred = allOf(urlContains('amazon'), urlContains('/checkout')); + const snapshot = createSnapshot('https://amazon.com/dp/B123456'); + + expect(pred.evaluate(snapshot)).toBe(false); + }); + }); + + describe('buildPredicate', () => { + it('should build url_contains predicate', () => { + const pred = buildPredicate({ predicate: 'url_contains', args: ['/cart'] }); + const snapshot = createSnapshot('https://example.com/cart'); + + expect(pred.name).toBe('url_contains'); + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should build url_matches predicate', () => { + const pred = buildPredicate({ predicate: 'url_matches', args: ['/dp/.*'] }); + const snapshot = createSnapshot('https://example.com/dp/123'); + + expect(pred.name).toBe('url_matches'); + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should build exists predicate', () => { + const pred = buildPredicate({ predicate: 'exists', args: ['Add to Cart'] }); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Add to Cart', role: 'button' }, + ]); + + expect(pred.name).toBe('exists'); + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should build not_exists predicate', () => { + const pred = buildPredicate({ predicate: 'not_exists', args: ['Error'] }); + const snapshot = createSnapshot('https://example.com', []); + + expect(pred.name).toBe('not_exists'); + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should build element_count predicate', () => { + const pred = buildPredicate({ predicate: 'element_count', args: ['button', 1, 5] }); + const snapshot = createSnapshot('https://example.com', [ + { text: 'Button 1', role: 'button' }, + { text: 'Button 2', role: 'button' }, + ]); + + expect(pred.name).toBe('element_count'); + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should build any_of predicate', () => { + const pred = buildPredicate({ + predicate: 'any_of', + args: [ + { predicate: 'url_contains', args: ['/cart'] }, + { predicate: 'url_contains', args: ['/checkout'] }, + ], + }); + const snapshot = createSnapshot('https://example.com/cart'); + + expect(pred.name).toBe('any_of'); + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should build all_of predicate', () => { + const pred = buildPredicate({ + predicate: 'all_of', + args: [ + { predicate: 'url_contains', args: ['amazon'] }, + { predicate: 'url_contains', args: ['/dp/'] }, + ], + }); + const snapshot = createSnapshot('https://amazon.com/dp/123'); + + expect(pred.name).toBe('all_of'); + expect(pred.evaluate(snapshot)).toBe(true); + }); + + it('should handle unknown predicate gracefully', () => { + const pred = buildPredicate({ predicate: 'unknown_predicate', args: ['foo'] }); + + expect(pred.name).toBe('unknown:unknown_predicate'); + expect(pred.evaluate(createSnapshot(''))).toBe(true); // Always passes + }); + }); + + describe('evaluatePredicates', () => { + it('should return true when all predicates pass', () => { + const predicates = [ + { predicate: 'url_contains', args: ['/cart'] }, + { predicate: 'exists', args: ['Checkout'] }, + ]; + const snapshot = createSnapshot('https://example.com/cart', [ + { text: 'Proceed to Checkout', role: 'button' }, + ]); + + expect(evaluatePredicates(predicates, snapshot)).toBe(true); + }); + + it('should return false when any predicate fails', () => { + const predicates = [ + { predicate: 'url_contains', args: ['/cart'] }, + { predicate: 'exists', args: ['Login'] }, + ]; + const snapshot = createSnapshot('https://example.com/cart', [ + { text: 'Checkout', role: 'button' }, + ]); + + expect(evaluatePredicates(predicates, snapshot)).toBe(false); + }); + + it('should return true for empty predicates array', () => { + const snapshot = createSnapshot('https://example.com'); + + expect(evaluatePredicates([], snapshot)).toBe(true); + }); + + it('should handle errors gracefully', () => { + // Create an intentionally broken predicate by passing a circular reference + const circular: any = { predicate: 'any_of', args: [] }; + circular.args.push(circular); // Create circular reference + + const predicates = [circular]; + const snapshot = createSnapshot('https://example.com'); + + // Should not throw - the buildPredicate catches errors + expect(() => evaluatePredicates(predicates, snapshot)).not.toThrow(); + }); + }); +}); diff --git a/tests/agents/planner-executor/recovery.test.ts b/tests/agents/planner-executor/recovery.test.ts new file mode 100644 index 0000000..18c7012 --- /dev/null +++ b/tests/agents/planner-executor/recovery.test.ts @@ -0,0 +1,388 @@ +/** + * Tests for recovery navigation state management. + */ + +import { + RecoveryState, + DEFAULT_RECOVERY_CONFIG, + type RecoveryCheckpoint, + type RecoveryNavigationConfig, +} from '../../../src/agents/planner-executor/recovery'; + +describe('recovery', () => { + describe('DEFAULT_RECOVERY_CONFIG', () => { + it('should have expected default values', () => { + expect(DEFAULT_RECOVERY_CONFIG.enabled).toBe(true); + expect(DEFAULT_RECOVERY_CONFIG.maxRecoveryAttempts).toBe(2); + expect(DEFAULT_RECOVERY_CONFIG.trackSuccessfulUrls).toBe(true); + expect(DEFAULT_RECOVERY_CONFIG.maxCheckpoints).toBe(10); + }); + }); + + describe('RecoveryState', () => { + let state: RecoveryState; + + beforeEach(() => { + state = new RecoveryState(); + }); + + describe('initial state', () => { + it('should start with no checkpoints', () => { + expect(state.length).toBe(0); + expect(state.lastSuccessfulUrl).toBeNull(); + expect(state.lastSuccessfulStep).toBeNull(); + }); + + it('should have zero attempts used', () => { + expect(state.attemptsUsed).toBe(0); + }); + + it('should not be able to recover with no checkpoints', () => { + expect(state.canRecover()).toBe(false); + }); + + it('should return null for getRecoveryTarget with no checkpoints', () => { + expect(state.getRecoveryTarget()).toBeNull(); + }); + }); + + describe('recordCheckpoint', () => { + it('should record a checkpoint', () => { + const checkpoint = state.recordCheckpoint({ + url: 'https://example.com/page1', + stepIndex: 0, + snapshotDigest: 'abc123', + predicatesPassed: ['url_contains("/page1")'], + }); + + expect(checkpoint.url).toBe('https://example.com/page1'); + expect(checkpoint.stepIndex).toBe(0); + expect(checkpoint.snapshotDigest).toBe('abc123'); + expect(checkpoint.predicatesPassed).toEqual(['url_contains("/page1")']); + expect(checkpoint.timestamp).toBeInstanceOf(Date); + }); + + it('should update lastSuccessfulUrl and lastSuccessfulStep', () => { + state.recordCheckpoint({ + url: 'https://example.com/page1', + stepIndex: 0, + snapshotDigest: 'abc123', + predicatesPassed: [], + }); + + expect(state.lastSuccessfulUrl).toBe('https://example.com/page1'); + expect(state.lastSuccessfulStep).toBe(0); + + state.recordCheckpoint({ + url: 'https://example.com/page2', + stepIndex: 1, + snapshotDigest: 'def456', + predicatesPassed: [], + }); + + expect(state.lastSuccessfulUrl).toBe('https://example.com/page2'); + expect(state.lastSuccessfulStep).toBe(1); + }); + + it('should increment length', () => { + expect(state.length).toBe(0); + + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + expect(state.length).toBe(1); + + state.recordCheckpoint({ + url: 'https://example.com/page2', + stepIndex: 1, + snapshotDigest: 'def', + predicatesPassed: [], + }); + + expect(state.length).toBe(2); + }); + + it('should limit checkpoints to maxCheckpoints', () => { + const customState = new RecoveryState({ maxCheckpoints: 3 }); + + for (let i = 0; i < 5; i++) { + customState.recordCheckpoint({ + url: `https://example.com/page${i}`, + stepIndex: i, + snapshotDigest: `digest${i}`, + predicatesPassed: [], + }); + } + + expect(customState.length).toBe(3); + expect(customState.lastSuccessfulStep).toBe(4); + expect(customState.lastSuccessfulUrl).toBe('https://example.com/page4'); + }); + + it('should handle missing predicatesPassed', () => { + const checkpoint = state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + } as Omit); + + expect(checkpoint.predicatesPassed).toEqual([]); + }); + }); + + describe('getRecoveryTarget', () => { + it('should return most recent checkpoint', () => { + state.recordCheckpoint({ + url: 'https://example.com/page1', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + state.recordCheckpoint({ + url: 'https://example.com/page2', + stepIndex: 1, + snapshotDigest: 'def', + predicatesPassed: [], + }); + + const target = state.getRecoveryTarget(); + expect(target?.url).toBe('https://example.com/page2'); + expect(target?.stepIndex).toBe(1); + }); + }); + + describe('getCheckpointAtStep', () => { + beforeEach(() => { + state.recordCheckpoint({ + url: 'https://example.com/page0', + stepIndex: 0, + snapshotDigest: 'digest0', + predicatesPassed: [], + }); + state.recordCheckpoint({ + url: 'https://example.com/page1', + stepIndex: 1, + snapshotDigest: 'digest1', + predicatesPassed: [], + }); + state.recordCheckpoint({ + url: 'https://example.com/page2', + stepIndex: 2, + snapshotDigest: 'digest2', + predicatesPassed: [], + }); + }); + + it('should return checkpoint at specific step', () => { + const checkpoint = state.getCheckpointAtStep(1); + expect(checkpoint?.url).toBe('https://example.com/page1'); + expect(checkpoint?.stepIndex).toBe(1); + }); + + it('should return null for non-existent step', () => { + expect(state.getCheckpointAtStep(5)).toBeNull(); + }); + }); + + describe('getCheckpointBeforeStep', () => { + beforeEach(() => { + state.recordCheckpoint({ + url: 'https://example.com/page0', + stepIndex: 0, + snapshotDigest: 'digest0', + predicatesPassed: [], + }); + state.recordCheckpoint({ + url: 'https://example.com/page2', + stepIndex: 2, + snapshotDigest: 'digest2', + predicatesPassed: [], + }); + state.recordCheckpoint({ + url: 'https://example.com/page4', + stepIndex: 4, + snapshotDigest: 'digest4', + predicatesPassed: [], + }); + }); + + it('should return most recent checkpoint before step', () => { + const checkpoint = state.getCheckpointBeforeStep(3); + expect(checkpoint?.stepIndex).toBe(2); + }); + + it('should return null for step 0', () => { + expect(state.getCheckpointBeforeStep(0)).toBeNull(); + }); + + it('should skip steps and find earlier checkpoint', () => { + const checkpoint = state.getCheckpointBeforeStep(2); + expect(checkpoint?.stepIndex).toBe(0); + }); + }); + + describe('canRecover', () => { + it('should return true with checkpoints and attempts remaining', () => { + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + expect(state.canRecover()).toBe(true); + }); + + it('should return false when all attempts used', () => { + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + // Use all attempts + state.consumeRecoveryAttempt(); + state.consumeRecoveryAttempt(); + + expect(state.canRecover()).toBe(false); + }); + + it('should return false with no checkpoints', () => { + expect(state.canRecover()).toBe(false); + }); + }); + + describe('consumeRecoveryAttempt', () => { + it('should return checkpoint and increment attempts', () => { + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + const checkpoint = state.consumeRecoveryAttempt(); + expect(checkpoint?.url).toBe('https://example.com'); + expect(state.attemptsUsed).toBe(1); + }); + + it('should set currentRecoveryTarget', () => { + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + state.consumeRecoveryAttempt(); + expect(state.currentRecoveryTarget?.url).toBe('https://example.com'); + }); + + it('should return null when recovery not possible', () => { + expect(state.consumeRecoveryAttempt()).toBeNull(); + }); + + it('should return null when attempts exhausted', () => { + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + state.consumeRecoveryAttempt(); + state.consumeRecoveryAttempt(); + + expect(state.consumeRecoveryAttempt()).toBeNull(); + }); + }); + + describe('clearRecoveryTarget', () => { + it('should clear current recovery target', () => { + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + state.consumeRecoveryAttempt(); + expect(state.currentRecoveryTarget).not.toBeNull(); + + state.clearRecoveryTarget(); + expect(state.currentRecoveryTarget).toBeNull(); + }); + }); + + describe('reset', () => { + it('should reset all state', () => { + state.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + state.consumeRecoveryAttempt(); + + state.reset(); + + expect(state.length).toBe(0); + expect(state.attemptsUsed).toBe(0); + expect(state.currentRecoveryTarget).toBeNull(); + expect(state.lastSuccessfulUrl).toBeNull(); + expect(state.lastSuccessfulStep).toBeNull(); + }); + }); + + describe('popCheckpoint', () => { + it('should remove and return most recent checkpoint', () => { + state.recordCheckpoint({ + url: 'https://example.com/page1', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + state.recordCheckpoint({ + url: 'https://example.com/page2', + stepIndex: 1, + snapshotDigest: 'def', + predicatesPassed: [], + }); + + const popped = state.popCheckpoint(); + expect(popped?.url).toBe('https://example.com/page2'); + expect(state.length).toBe(1); + expect(state.lastSuccessfulUrl).toBe('https://example.com/page1'); + }); + + it('should return null for empty state', () => { + expect(state.popCheckpoint()).toBeNull(); + }); + }); + + describe('custom config', () => { + it('should respect custom maxRecoveryAttempts', () => { + const customState = new RecoveryState({ maxRecoveryAttempts: 5 }); + customState.recordCheckpoint({ + url: 'https://example.com', + stepIndex: 0, + snapshotDigest: 'abc', + predicatesPassed: [], + }); + + for (let i = 0; i < 5; i++) { + expect(customState.canRecover()).toBe(true); + customState.consumeRecoveryAttempt(); + } + + expect(customState.canRecover()).toBe(false); + }); + }); + }); +}); diff --git a/tests/agents/planner-executor/vision-fallback.test.ts b/tests/agents/planner-executor/vision-fallback.test.ts new file mode 100644 index 0000000..e0dfc4d --- /dev/null +++ b/tests/agents/planner-executor/vision-fallback.test.ts @@ -0,0 +1,186 @@ +/** + * Tests for vision fallback detection. + */ + +import { + detectSnapshotFailure, + shouldUseVision, + type VisionFallbackResult, +} from '../../../src/agents/planner-executor/vision-fallback'; +import type { Snapshot } from '../../../src/agents/planner-executor/plan-models'; + +describe('vision-fallback', () => { + describe('detectSnapshotFailure', () => { + it('should require vision for null snapshot', () => { + const result = detectSnapshotFailure(null); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('snapshot_null'); + }); + + it('should not require vision for snapshot with 10+ elements', () => { + const snapshot: Snapshot = { + url: 'https://example.com', + title: 'Test Page', + elements: Array.from({ length: 15 }, (_, i) => ({ + id: i, + role: 'button', + text: `Button ${i}`, + })), + }; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(false); + expect(result.reason).toBeNull(); + }); + + it('should require vision when status is require_vision', () => { + const snapshot: Snapshot = { + url: 'https://example.com', + title: 'Test Page', + elements: Array.from({ length: 5 }, (_, i) => ({ + id: i, + role: 'button', + text: `Button ${i}`, + })), + status: 'require_vision', + }; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('require_vision'); + }); + + it('should require vision when status is error', () => { + const snapshot: Snapshot = { + url: 'https://example.com', + title: 'Test Page', + elements: Array.from({ length: 5 }, (_, i) => ({ + id: i, + role: 'button', + text: `Button ${i}`, + })), + status: 'error', + }; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('snapshot_error'); + }); + + it('should require vision for too few elements (< 3)', () => { + const snapshot: Snapshot = { + url: 'https://example.com', + title: 'Test Page', + elements: [ + { id: 0, role: 'button', text: 'Button 0' }, + { id: 1, role: 'button', text: 'Button 1' }, + ], + }; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('too_few_elements'); + }); + + it('should not require vision with 3-9 elements and success status', () => { + const snapshot: Snapshot = { + url: 'https://example.com', + title: 'Test Page', + elements: Array.from({ length: 5 }, (_, i) => ({ + id: i, + role: 'button', + text: `Button ${i}`, + })), + status: 'success', + }; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(false); + expect(result.reason).toBeNull(); + }); + + it('should require vision for low confidence diagnostics', () => { + const snapshot = { + url: 'https://example.com', + elements: Array.from({ length: 5 }, (_, i) => ({ + id: i, + role: 'button', + text: `Button ${i}`, + })), + diagnostics: { + confidence: 0.2, + }, + } as unknown as Snapshot; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('low_confidence'); + }); + + it('should require vision for canvas page with few elements', () => { + const snapshot = { + url: 'https://example.com', + elements: Array.from({ length: 4 }, (_, i) => ({ + id: i, + role: 'button', + text: `Button ${i}`, + })), + diagnostics: { + hasCanvas: true, + }, + } as unknown as Snapshot; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('canvas_page'); + }); + + it('should not require vision for canvas page with many elements', () => { + const snapshot = { + url: 'https://example.com', + elements: Array.from({ length: 15 }, (_, i) => ({ + id: i, + role: 'button', + text: `Button ${i}`, + })), + diagnostics: { + hasCanvas: true, + }, + } as unknown as Snapshot; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(false); + expect(result.reason).toBeNull(); + }); + + it('should handle empty elements array', () => { + const snapshot: Snapshot = { + url: 'https://example.com', + title: 'Test Page', + elements: [], + }; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('too_few_elements'); + }); + + it('should handle undefined elements', () => { + const snapshot = { + url: 'https://example.com', + } as Snapshot; + const result = detectSnapshotFailure(snapshot); + expect(result.shouldUseVision).toBe(true); + expect(result.reason).toBe('too_few_elements'); + }); + }); + + describe('shouldUseVision', () => { + it('should return true if snapshot failed', () => { + expect(shouldUseVision(false, false)).toBe(true); + }); + + it('should return true if requiresVision is true', () => { + expect(shouldUseVision(true, true)).toBe(true); + }); + + it('should return false if snapshot succeeded and vision not required', () => { + expect(shouldUseVision(true, false)).toBe(false); + }); + + it('should return true if both conditions fail', () => { + expect(shouldUseVision(false, true)).toBe(true); + }); + }); +}); diff --git a/tests/planner-executor-config.test.ts b/tests/planner-executor-config.test.ts index 4ec6763..4bb78a9 100644 --- a/tests/planner-executor-config.test.ts +++ b/tests/planner-executor-config.test.ts @@ -28,8 +28,9 @@ describe('getConfigPreset', () => { it('should return optimized config for LOCAL_SMALL_MODEL', () => { const config = getConfigPreset(ConfigPreset.LOCAL_SMALL_MODEL); - expect(config.plannerMaxTokens).toBe(1024); - expect(config.executorMaxTokens).toBe(64); + // HIGH token limits for local models like Qwen3 that include reasoning in output + expect(config.plannerMaxTokens).toBe(8192); + expect(config.executorMaxTokens).toBe(4096); expect(config.retry.verifyTimeoutMs).toBe(15000); expect(config.retry.verifyMaxAttempts).toBe(6); expect(config.verbose).toBe(true); @@ -59,7 +60,7 @@ describe('getConfigPreset', () => { it('should accept string preset names', () => { const config = getConfigPreset('local_small'); - expect(config.plannerMaxTokens).toBe(1024); + expect(config.plannerMaxTokens).toBe(8192); }); }); @@ -169,7 +170,7 @@ describe('resolveConfig', () => { it('should resolve string preset', () => { const config = resolveConfig('local_small'); - expect(config.plannerMaxTokens).toBe(1024); + expect(config.plannerMaxTokens).toBe(8192); }); it('should resolve ConfigPreset enum', () => { @@ -206,7 +207,7 @@ describe('createPlannerExecutorAgentProviders', () => { config: ConfigPreset.LOCAL_SMALL_MODEL, }); - expect(result.config.plannerMaxTokens).toBe(1024); + expect(result.config.plannerMaxTokens).toBe(8192); expect(result.config.verbose).toBe(true); });