diff --git a/src/main/browser/BrowserMcpIngress.ts b/src/main/browser/BrowserMcpIngress.ts index 64c780a7..297e51c6 100644 --- a/src/main/browser/BrowserMcpIngress.ts +++ b/src/main/browser/BrowserMcpIngress.ts @@ -1,60 +1,46 @@ -import { createServer, type IncomingMessage, type Server, type ServerResponse } from "node:http"; -import { randomBytes, randomUUID } from "node:crypto"; import type { BrowserPanelManager } from "./BrowserPanelManager"; +import { + StreamableHttpMcpIngress, + type StreamableHttpMcpIngressInfo, +} from "../mcp/StreamableHttpMcpIngress"; import { BROWSER_MCP_INSTRUCTIONS, TOOLS, dispatchTool, formatToolResult, isKnownToolName, - type McpToolResult, normalizeToolName, type ToolContext, } from "./mcp/toolRegistry"; -export interface BrowserMcpIngressInfo { - url: string; - token: string; - port: number; -} +export type BrowserMcpIngressInfo = StreamableHttpMcpIngressInfo; -const MAX_BODY = 1024 * 1024; -const MCP_PROTOCOL_VERSION = "2025-03-26"; const PASSIVE_TOOLS = new Set(["api", "list_tabs", "get_url", "get_title"]); -interface JsonRpcRequest { - jsonrpc: "2.0"; - id?: number | string | null; - method: string; - params?: unknown; -} - -interface JsonRpcResponseOk { - jsonrpc: "2.0"; - id: number | string | null; - result: unknown; -} - -interface JsonRpcResponseErr { - jsonrpc: "2.0"; - id: number | string | null; - error: { code: number; message: string; data?: unknown }; -} - -type JsonRpcResponse = JsonRpcResponseOk | JsonRpcResponseErr; - /** * Single in-process MCP server. Speaks Streamable-HTTP MCP at `POST /mcp` * (JSON-RPC body, single JSON response). All five agent providers connect * here by URL — no per-thread Node child process. 
*/ export class BrowserMcpIngress { - private server: Server | null = null; - private token = randomBytes(32).toString("hex"); - private info: BrowserMcpIngressInfo | null = null; private allowEval = false; private allowDataAccess = false; private getManager: (() => BrowserPanelManager | null) | null = null; + private readonly ingress = new StreamableHttpMcpIngress({ + serverInfo: { name: "browser", version: "2.0.0" }, + instructions: BROWSER_MCP_INSTRUCTIONS, + tools: TOOLS, + isKnownToolName, + buildContext: () => this.buildContext(), + contextUnavailableMessage: "browser panel not ready", + onBeforeToolCall: (name, ctx) => { + if (shouldRevealPanelForTool(name)) { + ctx.manager.revealPanel(); + } + }, + dispatchTool, + formatToolResult, + }); setManagerAccessor(getter: () => BrowserPanelManager | null): void { this.getManager = getter; @@ -68,36 +54,16 @@ export class BrowserMcpIngress { this.allowDataAccess = allow; } - async start(): Promise { - if (this.info) return this.info; - return await new Promise((resolve, reject) => { - const server = createServer((req, res) => this.handle(req, res)); - server.on("error", reject); - // Bind 0.0.0.0 so WSL agents can reach the host via gateway IP. Access - // is guarded by a 256-bit bearer token regenerated per app launch; the - // URL is only ever passed to immediate child processes via env vars. - server.listen(0, "0.0.0.0", () => { - const addr = server.address(); - const port = typeof addr === "object" && addr ? 
addr.port : 0; - this.server = server; - this.info = { url: `http://127.0.0.1:${port}`, token: this.token, port }; - resolve(this.info); - }); - }); + start(): Promise { + return this.ingress.start(); } getInfo(): BrowserMcpIngressInfo | null { - return this.info; + return this.ingress.getInfo(); } dispose(): void { - try { - this.server?.closeAllConnections?.(); - } catch {} - try { - this.server?.close(); - } catch {} - this.server = null; + this.ingress.dispose(); } private buildContext(): ToolContext | null { @@ -109,227 +75,6 @@ export class BrowserMcpIngress { allowDataAccess: this.allowDataAccess, }; } - - private async readBody(req: IncomingMessage): Promise { - return await new Promise((resolve, reject) => { - let total = 0; - const chunks: Buffer[] = []; - req.on("data", (chunk: Buffer) => { - total += chunk.length; - if (total > MAX_BODY) { - req.destroy(); - reject(new Error("body too large")); - return; - } - chunks.push(chunk); - }); - req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8"))); - req.on("error", reject); - }); - } - - private sendJson(res: ServerResponse, status: number, body: unknown): void { - res.statusCode = status; - res.setHeader("Content-Type", "application/json"); - res.setHeader("Cache-Control", "no-store"); - res.end(JSON.stringify(body)); - } - - private checkAuth(req: IncomingMessage): boolean { - const auth = req.headers.authorization; - if (auth && auth.startsWith("Bearer ") && auth.slice(7).trim() === this.token) { - return true; - } - // Some MCP clients pass the token in a custom header. 
- const xToken = req.headers["x-lightcode-token"]; - if (typeof xToken === "string" && xToken === this.token) return true; - return false; - } - - private async handle(req: IncomingMessage, res: ServerResponse): Promise { - try { - if (!req.url) { - this.sendJson(res, 404, { error: "not found" }); - return; - } - const path = new URL(req.url, "http://x").pathname; - - // CORS preflight — not strictly required for an MCP HTTP endpoint hit - // from local processes, but harmless. - if (req.method === "OPTIONS") { - res.statusCode = 204; - res.setHeader( - "Access-Control-Allow-Headers", - "Authorization, X-Lightcode-Token, Content-Type, Mcp-Session-Id", - ); - res.setHeader("Access-Control-Allow-Methods", "POST, GET, OPTIONS"); - res.setHeader("Access-Control-Allow-Origin", "*"); - res.end(); - return; - } - - if (!this.checkAuth(req)) { - this.sendJson(res, 401, { error: "unauthorized" }); - return; - } - - if (path === "/mcp" || path === "/mcp/") { - if (req.method === "GET") { - // MCP Streamable HTTP allows GET to open an SSE stream. We don't - // push server-initiated events; return 405 with Allow header. - res.statusCode = 405; - res.setHeader("Allow", "POST"); - res.end(); - return; - } - if (req.method !== "POST") { - this.sendJson(res, 405, { error: "method not allowed" }); - return; - } - await this.handleMcp(req, res); - return; - } - - this.sendJson(res, 404, { error: "not found" }); - } catch (err) { - this.sendJson(res, 500, { error: (err as Error).message ?? "internal" }); - } - } - - private async handleMcp(req: IncomingMessage, res: ServerResponse): Promise { - const raw = await this.readBody(req); - let body: unknown; - try { - body = raw ? JSON.parse(raw) : {}; - } catch { - this.sendJson(res, 400, { - jsonrpc: "2.0", - id: null, - error: { code: -32700, message: "Parse error" }, - }); - return; - } - - // Mcp-Session-Id: stateless server, but echo a session id so clients - // that key off of it have one. 
- let sessionId = req.headers["mcp-session-id"]; - if (Array.isArray(sessionId)) sessionId = sessionId[0]; - if (typeof sessionId !== "string" || !sessionId) { - sessionId = randomUUID(); - } - res.setHeader("Mcp-Session-Id", sessionId); - - // Streamable HTTP allows a single response or a batch. Match the input. - if (Array.isArray(body)) { - const out: JsonRpcResponse[] = []; - for (const m of body) { - const reply = await this.handleSingle(m); - if (reply) out.push(reply); - } - this.sendJson(res, 200, out); - return; - } - const reply = await this.handleSingle(body); - if (!reply) { - // notification — no response - res.statusCode = 202; - res.end(); - return; - } - this.sendJson(res, 200, reply); - } - - private async handleSingle(message: unknown): Promise { - if (!isJsonRpcRequest(message)) return null; - const { id = null, method, params } = message; - try { - if (method === "initialize") { - return { - jsonrpc: "2.0", - id, - result: { - protocolVersion: MCP_PROTOCOL_VERSION, - capabilities: { tools: {} }, - serverInfo: { name: "browser", version: "2.0.0" }, - instructions: BROWSER_MCP_INSTRUCTIONS, - }, - }; - } - if (method === "notifications/initialized" || method === "initialized") { - return null; - } - if (method === "ping") { - return { jsonrpc: "2.0", id, result: {} }; - } - if (method === "tools/list") { - return { jsonrpc: "2.0", id, result: { tools: TOOLS } }; - } - if (method === "tools/call") { - const p = (params ?? {}) as { name?: string; arguments?: Record }; - const name = String(p.name ?? ""); - const args = (p.arguments ?? 
{}) as Record; - if (!isKnownToolName(name)) { - return { - jsonrpc: "2.0", - id, - result: { - isError: true, - content: [{ type: "text", text: `Unknown tool: ${name}` }], - }, - }; - } - const ctx = this.buildContext(); - if (!ctx) { - return { - jsonrpc: "2.0", - id, - result: { - isError: true, - content: [{ type: "text", text: "browser panel not ready" }], - }, - }; - } - if (shouldRevealPanelForTool(name)) { - ctx.manager.revealPanel(); - } - let raw: unknown; - try { - raw = await dispatchTool(name, args, ctx); - } catch (err) { - return { - jsonrpc: "2.0", - id, - result: { - isError: true, - content: [{ type: "text", text: (err as Error).message ?? String(err) }], - }, - }; - } - const result: McpToolResult = formatToolResult(name, raw); - return { jsonrpc: "2.0", id, result }; - } - return { - jsonrpc: "2.0", - id, - error: { code: -32601, message: `Method not found: ${method}` }, - }; - } catch (err) { - return { - jsonrpc: "2.0", - id, - error: { code: -32000, message: (err as Error).message ?? 
"internal" }, - }; - } - } -} - -function isJsonRpcRequest(value: unknown): value is JsonRpcRequest { - return ( - typeof value === "object" && - value !== null && - (value as { jsonrpc?: unknown }).jsonrpc === "2.0" && - typeof (value as { method?: unknown }).method === "string" - ); } function shouldRevealPanelForTool(name: string): boolean { diff --git a/src/main/computer-use/ComputerUseMcpIngress.test.ts b/src/main/computer-use/ComputerUseMcpIngress.test.ts new file mode 100644 index 00000000..8e31f7c7 --- /dev/null +++ b/src/main/computer-use/ComputerUseMcpIngress.test.ts @@ -0,0 +1,64 @@ +import { afterEach, describe, expect, it } from "vitest"; +import { ComputerUseMcpIngress } from "./ComputerUseMcpIngress"; + +let ingress: ComputerUseMcpIngress | null = null; + +afterEach(() => { + ingress?.dispose(); + ingress = null; +}); + +describe("ComputerUseMcpIngress", () => { + it("advertises computer_use instructions and tools on initialize", async () => { + ingress = new ComputerUseMcpIngress(); + const info = await ingress.start(); + + const response = await fetch(`${info.url}/mcp`, { + method: "POST", + headers: { + Authorization: `Bearer ${info.token}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + jsonrpc: "2.0", + id: 1, + method: "initialize", + params: {}, + }), + }); + + const body = (await response.json()) as { + result: { + serverInfo: { name: string }; + instructions: string; + }; + }; + + expect(body.result.serverInfo.name).toBe("computer_use"); + expect(body.result.instructions).toContain("computer_use.api"); + expect(body.result.instructions).toContain("switch to interactive mode"); + }); + + it("requires bearer auth before listing tools", async () => { + ingress = new ComputerUseMcpIngress(); + const info = await ingress.start(); + + const unauthorized = await fetch(`${info.url}/mcp`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ jsonrpc: "2.0", id: 1, method: "tools/list" 
}), + }); + expect(unauthorized.status).toBe(401); + + const authorized = await fetch(`${info.url}/mcp`, { + method: "POST", + headers: { + Authorization: `Bearer ${info.token}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ jsonrpc: "2.0", id: 2, method: "tools/list" }), + }); + const body = (await authorized.json()) as { result: { tools: Array<{ name: string }> } }; + expect(body.result.tools.map((tool) => tool.name)).toContain("get_window_state"); + }); +}); diff --git a/src/main/computer-use/ComputerUseMcpIngress.ts b/src/main/computer-use/ComputerUseMcpIngress.ts new file mode 100644 index 00000000..0f42ea42 --- /dev/null +++ b/src/main/computer-use/ComputerUseMcpIngress.ts @@ -0,0 +1,44 @@ +import { createComputerUseDriver } from "./drivers"; +import { + StreamableHttpMcpIngress, + type StreamableHttpMcpIngressInfo, +} from "../mcp/StreamableHttpMcpIngress"; +import { + COMPUTER_USE_MCP_INSTRUCTIONS, + TOOLS, + dispatchTool, + formatToolResult, + isKnownToolName, + type ToolContext, +} from "./mcp/toolRegistry"; + +export type ComputerUseMcpIngressInfo = StreamableHttpMcpIngressInfo; + +export class ComputerUseMcpIngress { + private readonly driver = createComputerUseDriver(); + private readonly ingress = new StreamableHttpMcpIngress({ + serverInfo: { name: "computer_use", version: "0.1.0" }, + instructions: COMPUTER_USE_MCP_INSTRUCTIONS, + tools: TOOLS, + isKnownToolName, + buildContext: () => this.buildContext(), + dispatchTool, + formatToolResult, + }); + + start(): Promise { + return this.ingress.start(); + } + + getInfo(): ComputerUseMcpIngressInfo | null { + return this.ingress.getInfo(); + } + + dispose(): void { + this.ingress.dispose(); + } + + private buildContext(): ToolContext { + return { driver: this.driver }; + } +} diff --git a/src/main/computer-use/drivers/common.ts b/src/main/computer-use/drivers/common.ts new file mode 100644 index 00000000..c74fbed6 --- /dev/null +++ b/src/main/computer-use/drivers/common.ts @@ -0,0 
+1,104 @@ +import { spawn } from "node:child_process"; +import type { ComputerUseWindow } from "../mcp/types"; + +export function readRecord(value: unknown): Record { + return value && typeof value === "object" ? (value as Record) : {}; +} + +export function readWindow(value: unknown): ComputerUseWindow { + const obj = readRecord(value); + const id = Number(obj.id); + const app = typeof obj.app === "string" ? obj.app : ""; + if (!Number.isFinite(id) || !app) { + throw new Error("window with app and id is required"); + } + return { + app, + id, + ...(typeof obj.title === "string" ? { title: obj.title } : {}), + ...(typeof obj.x === "number" ? { x: obj.x } : {}), + ...(typeof obj.y === "number" ? { y: obj.y } : {}), + ...(typeof obj.width === "number" ? { width: obj.width } : {}), + ...(typeof obj.height === "number" ? { height: obj.height } : {}), + }; +} + +export function readNumber(value: unknown, name: string): number { + const next = Number(value); + if (!Number.isFinite(next)) throw new Error(`${name} is required`); + return next; +} + +export function readString(value: unknown, name: string): string { + if (typeof value !== "string" || value.length === 0) throw new Error(`${name} is required`); + return value; +} + +export function runProcess( + command: string, + args: string[], + options?: { + input?: string; + timeoutMs?: number; + maxBufferBytes?: number; + }, +): Promise<{ stdout: string; stderr: string }> { + return new Promise((resolve, reject) => { + const child = spawn(command, args, { + stdio: ["pipe", "pipe", "pipe"], + windowsHide: true, + }); + const maxBufferBytes = options?.maxBufferBytes ?? 12 * 1024 * 1024; + const stdout: Buffer[] = []; + const stderr: Buffer[] = []; + let stdoutBytes = 0; + let stderrBytes = 0; + let settled = false; + const timer = + options?.timeoutMs && options.timeoutMs > 0 + ? 
setTimeout(() => { + if (settled) return; + settled = true; + child.kill(); + reject(new Error(`${command} timed out after ${options.timeoutMs}ms`)); + }, options.timeoutMs) + : undefined; + + child.stdout.on("data", (chunk: Buffer) => { + stdoutBytes += chunk.length; + if (stdoutBytes > maxBufferBytes) { + child.kill(); + return; + } + stdout.push(chunk); + }); + child.stderr.on("data", (chunk: Buffer) => { + stderrBytes += chunk.length; + if (stderrBytes <= maxBufferBytes) stderr.push(chunk); + }); + child.on("error", (error) => { + if (settled) return; + settled = true; + if (timer) clearTimeout(timer); + reject(error); + }); + child.on("close", (code) => { + if (settled) return; + settled = true; + if (timer) clearTimeout(timer); + const out = Buffer.concat(stdout).toString("utf8"); + const err = Buffer.concat(stderr).toString("utf8"); + if (stdoutBytes > maxBufferBytes) { + reject(new Error(`${command} output exceeded ${maxBufferBytes} bytes`)); + return; + } + if (code !== 0) { + reject(new Error(err.trim() || `${command} exited with code ${code}`)); + return; + } + resolve({ stdout: out, stderr: err }); + }); + if (options?.input) child.stdin.end(options.input); + else child.stdin.end(); + }); +} diff --git a/src/main/computer-use/drivers/index.ts b/src/main/computer-use/drivers/index.ts new file mode 100644 index 00000000..237359a1 --- /dev/null +++ b/src/main/computer-use/drivers/index.ts @@ -0,0 +1,67 @@ +import type { ComputerUseDriver } from "../mcp/types"; +import { MacComputerUseDriver } from "./macos"; +import { WindowsComputerUseDriver } from "./windows"; + +class UnsupportedComputerUseDriver implements ComputerUseDriver { + private unavailable(): never { + throw new Error("Computer Use is only available on macOS and Windows."); + } + + listApps(): Promise { + return Promise.reject(this.unavailable()); + } + + listWindows(): Promise { + return Promise.reject(this.unavailable()); + } + + getWindow(): Promise { + return 
Promise.reject(this.unavailable()); + } + + getWindowState(): Promise { + return Promise.reject(this.unavailable()); + } + + activateWindow(): Promise { + return Promise.reject(this.unavailable()); + } + + click(): Promise { + return Promise.reject(this.unavailable()); + } + + typeText(): Promise { + return Promise.reject(this.unavailable()); + } + + pressKey(): Promise { + return Promise.reject(this.unavailable()); + } + + scroll(): Promise { + return Promise.reject(this.unavailable()); + } + + drag(): Promise { + return Promise.reject(this.unavailable()); + } + + launchApp(): Promise { + return Promise.reject(this.unavailable()); + } + + setValue(): Promise { + return Promise.reject(this.unavailable()); + } + + performSecondaryAction(): Promise { + return Promise.reject(this.unavailable()); + } +} + +export function createComputerUseDriver(): ComputerUseDriver { + if (process.platform === "win32") return new WindowsComputerUseDriver(); + if (process.platform === "darwin") return new MacComputerUseDriver(); + return new UnsupportedComputerUseDriver(); +} diff --git a/src/main/computer-use/drivers/macos.ts b/src/main/computer-use/drivers/macos.ts new file mode 100644 index 00000000..a78ef9f8 --- /dev/null +++ b/src/main/computer-use/drivers/macos.ts @@ -0,0 +1,365 @@ +import { mkdir, readFile, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import type { + ComputerUseApp, + ComputerUseDriver, + ComputerUseWindow, + ComputerUseWindowState, +} from "../mcp/types"; +import { readNumber, runProcess } from "./common"; + +function hashWindowId(input: string): number { + let hash = 2166136261; + for (let i = 0; i < input.length; i += 1) { + hash ^= input.charCodeAt(i); + hash = Math.imul(hash, 16777619); + } + return hash >>> 0; +} + +function normalizeWindows(value: unknown): ComputerUseWindow[] { + const items = Array.isArray(value) ? value : value ? 
[value] : []; + const windows: ComputerUseWindow[] = []; + for (const item of items) { + const obj = item && typeof item === "object" ? (item as Record) : {}; + const app = typeof obj.app === "string" ? obj.app : ""; + const title = typeof obj.title === "string" ? obj.title : undefined; + const x = typeof obj.x === "number" ? obj.x : 0; + const y = typeof obj.y === "number" ? obj.y : 0; + const width = typeof obj.width === "number" ? obj.width : 0; + const height = typeof obj.height === "number" ? obj.height : 0; + if (!app || width <= 0 || height <= 0) continue; + windows.push({ + app, + id: hashWindowId(`${app}\n${title ?? ""}\n${x},${y},${width},${height}`), + ...(title ? { title } : {}), + x, + y, + width, + height, + }); + } + return windows; +} + +async function osascript(script: string): Promise { + const { stdout } = await runProcess("/usr/bin/osascript", ["-l", "JavaScript", "-e", script], { + timeoutMs: 15_000, + }); + return JSON.parse(stdout.trim()) as T; +} + +async function listMacWindows(): Promise { + const raw = await osascript(` +ObjC.import("stdlib"); +const app = Application("System Events"); +const windows = []; +for (const process of app.applicationProcesses()) { + if (!process.visible()) continue; + const appName = process.name(); + for (const window of process.windows()) { + let position = [0, 0]; + let size = [0, 0]; + try { position = window.position(); } catch {} + try { size = window.size(); } catch {} + let title = ""; + try { title = window.name(); } catch {} + windows.push({ + app: appName, + title, + x: Number(position[0]) || 0, + y: Number(position[1]) || 0, + width: Number(size[0]) || 0, + height: Number(size[1]) || 0, + }); + } +} +JSON.stringify(windows); +`); + return normalizeWindows(raw); +} + +function keyCodeForToken(token: string): number | undefined { + const t = token.trim().toLowerCase(); + const map: Record = { + return: 36, + enter: 36, + tab: 48, + escape: 53, + esc: 53, + delete: 51, + backspace: 51, + left: 123, + 
arrowleft: 123, + right: 124, + arrowright: 124, + down: 125, + arrowdown: 125, + up: 126, + arrowup: 126, + home: 115, + end: 119, + pageup: 116, + page_up: 116, + pagedown: 121, + page_down: 121, + space: 49, + }; + if (map[t] !== undefined) return map[t]; + const fKey = /^f([1-9]|1[0-9]|2[0])$/.exec(t); + if (fKey) { + const codes = [ + 122, 120, 99, 118, 96, 97, 98, 100, 101, 109, 103, 111, 105, 107, 113, 106, 64, 79, 80, 90, + ]; + return codes[Number(fKey[1]) - 1]; + } + return undefined; +} + +function modifierForToken(token: string): string | undefined { + const t = token.trim().toLowerCase(); + if (t === "control" || t === "ctrl" || t === "control_l" || t === "control_r") + return "control down"; + if (t === "shift" || t === "shift_l" || t === "shift_r") return "shift down"; + if (t === "alt" || t === "option" || t === "alt_l" || t === "alt_r") return "option down"; + if (t === "command" || t === "cmd" || t === "meta") return "command down"; + return undefined; +} + +function quoteAppleScript(value: string): string { + return JSON.stringify(value); +} + +async function runAppleScript(script: string): Promise { + await runProcess("/usr/bin/osascript", ["-e", script], { timeoutMs: 10_000 }); +} + +async function activateApp(app: string): Promise { + await runAppleScript(` +tell application "System Events" + set frontmost of first application process whose name is ${quoteAppleScript(app)} to true +end tell +`); +} + +export class MacComputerUseDriver implements ComputerUseDriver { + async listApps(): Promise { + const windows = await listMacWindows(); + const groups = new Map(); + for (const window of windows) { + const prev = groups.get(window.app) ?? 
[]; + prev.push(window); + groups.set(window.app, prev); + } + return [...groups.entries()].map(([id, appWindows]) => ({ + id, + displayName: id, + isRunning: true, + windows: appWindows, + })); + } + + listWindows(): Promise { + return listMacWindows(); + } + + async getWindow(input: { app?: string; id: number }): Promise { + const windows = await listMacWindows(); + const window = windows.find( + (candidate) => + candidate.id === input.id && (input.app === undefined || candidate.app === input.app), + ); + if (!window) throw new Error("Window is no longer available."); + return window; + } + + async getWindowState(input: { + include_screenshot?: boolean; + include_text?: boolean; + window: ComputerUseWindow; + }): Promise { + const window = await this.getWindow(input.window); + const screenshots: ComputerUseWindowState["screenshots"] = []; + const notes = [ + "macOS window listing and screenshots are passive. Input actions switch to interactive mode and activate the target app.", + "macOS captures the visible screen region; occluded windows and locked screens may require the user to reveal or unlock the desktop.", + ]; + if (input.include_screenshot !== false) { + const captureDir = join(tmpdir(), "lightcode-computer-use"); + await mkdir(captureDir, { recursive: true }); + const path = join( + captureDir, + `capture-${Date.now()}-${Math.random().toString(16).slice(2)}.png`, + ); + try { + const x = readNumber(window.x, "window.x"); + const y = readNumber(window.y, "window.y"); + const width = Math.max(1, readNumber(window.width, "window.width")); + const height = Math.max(1, readNumber(window.height, "window.height")); + await runProcess( + "/usr/sbin/screencapture", + ["-x", "-R", `${x},${y},${width},${height}`, path], + { + timeoutMs: 10_000, + maxBufferBytes: 1024 * 1024, + }, + ); + const bytes = await readFile(path); + screenshots.push({ + id: "window", + mimeType: "image/png", + data: bytes.toString("base64"), + width, + height, + originX: x, + originY: y, + 
zIndex: 0, + }); + } finally { + await rm(path, { force: true }); + } + } + return { + window, + accessibility: + input.include_text === true + ? { + tree: `Window: "${window.title ?? ""}", App: ${window.app}`, + } + : null, + screenshots, + mode: "passive", + notes, + }; + } + + async activateWindow(input: { + window: ComputerUseWindow; + }): Promise<{ ok: true; mode: "interactive" }> { + const window = await this.getWindow(input.window); + await activateApp(window.app); + return { ok: true, mode: "interactive" }; + } + + async click(input: { + click_count?: number; + mouse_button?: string; + window: ComputerUseWindow; + x?: number; + y?: number; + }): Promise<{ ok: true; mode: "interactive" }> { + const window = await this.getWindow(input.window); + await activateApp(window.app); + const x = readNumber(window.x, "window.x") + readNumber(input.x, "x"); + const y = readNumber(window.y, "window.y") + readNumber(input.y, "y"); + const count = Math.max(1, Math.trunc(input.click_count ?? 1)); + await runAppleScript(` +tell application "System Events" + click at {${x}, ${y}} + ${count > 1 ? 
`click at {${x}, ${y}}` : ""} +end tell +`); + return { ok: true, mode: "interactive" }; + } + + async typeText(input: { text: string; window: ComputerUseWindow }): Promise<{ + ok: true; + mode: "interactive"; + }> { + const window = await this.getWindow(input.window); + await activateApp(window.app); + await runAppleScript(` +tell application "System Events" + keystroke ${quoteAppleScript(input.text)} +end tell +`); + return { ok: true, mode: "interactive" }; + } + + async pressKey(input: { key: string; window: ComputerUseWindow }): Promise<{ + ok: true; + mode: "interactive"; + }> { + const window = await this.getWindow(input.window); + await activateApp(window.app); + const tokens = input.key + .split("+") + .map((token) => token.trim()) + .filter(Boolean); + const modifiers = tokens + .map(modifierForToken) + .filter((token): token is string => Boolean(token)); + const keyToken = tokens.find((token) => !modifierForToken(token)); + if (!keyToken) throw new Error("key is required"); + const using = modifiers.length ? ` using {${modifiers.join(", ")}}` : ""; + const keyCode = keyCodeForToken(keyToken); + await runAppleScript(` +tell application "System Events" + ${keyCode === undefined ? `keystroke ${quoteAppleScript(keyToken)}${using}` : `key code ${keyCode}${using}`} +end tell +`); + return { ok: true, mode: "interactive" }; + } + + async scroll(input: { + scrollX: number; + scrollY: number; + window: ComputerUseWindow; + x: number; + y: number; + }): Promise<{ ok: true; mode: "interactive" }> { + const window = await this.getWindow(input.window); + await activateApp(window.app); + const direction = input.scrollY >= 0 ? 
"down" : "up"; + const steps = Math.max(1, Math.min(20, Math.round(Math.abs(input.scrollY) / 120))); + await runAppleScript(` +tell application "System Events" + scroll ${direction} ${steps} +end tell +`); + return { ok: true, mode: "interactive" }; + } + + async drag(input: { + from_x: number; + from_y: number; + to_x: number; + to_y: number; + window: ComputerUseWindow; + }): Promise<{ ok: true; mode: "interactive" }> { + const window = await this.getWindow(input.window); + await activateApp(window.app); + const fromX = readNumber(window.x, "window.x") + input.from_x; + const fromY = readNumber(window.y, "window.y") + input.from_y; + const toX = readNumber(window.x, "window.x") + input.to_x; + const toY = readNumber(window.y, "window.y") + input.to_y; + await runAppleScript(` +tell application "System Events" + drag from {${fromX}, ${fromY}} to {${toX}, ${toY}} +end tell +`); + return { ok: true, mode: "interactive" }; + } + + async launchApp(input: { app: string }): Promise<{ ok: true }> { + if (input.app.startsWith("/") || input.app.endsWith(".app")) { + await runProcess("/usr/bin/open", [input.app], { timeoutMs: 10_000 }); + } else { + await runProcess("/usr/bin/open", ["-a", input.app], { timeoutMs: 10_000 }); + } + return { ok: true }; + } + + setValue(): Promise<{ ok: true; mode: "interactive" }> { + throw new Error( + "set_value is not supported yet; click or focus the target field, then use type_text.", + ); + } + + performSecondaryAction(): Promise<{ ok: true; mode: "interactive" }> { + throw new Error( + "perform_secondary_action is not supported yet; use keyboard navigation or coordinate input.", + ); + } +} diff --git a/src/main/computer-use/drivers/windows.ts b/src/main/computer-use/drivers/windows.ts new file mode 100644 index 00000000..2ae41c11 --- /dev/null +++ b/src/main/computer-use/drivers/windows.ts @@ -0,0 +1,554 @@ +import { mkdirSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; 
/**
 * PowerShell helper executed once per request.
 *
 * The script reads a JSON request (`{ action, input }`) from stdin, compiles a
 * small C# P/Invoke shim with `Add-Type`, dispatches on `action`, and prints
 * the result with `ConvertTo-Json`. Failures are thrown, and
 * `$ErrorActionPreference = 'Stop'` makes them terminate the script.
 *
 * Constraints when editing this string:
 * - It is a `String.raw` template: no backticks and no dollar-brace sequences.
 * - The C# part sits in an expandable PowerShell here-string, so it must not
 *   contain `$`, and the closing `"@` must stay at column 0.
 * - The wheel delta is declared as a signed `int` because PowerShell refuses
 *   to cast negative numbers to `[uint32]`.
 */
const WINDOWS_HELPER = String.raw`
$ErrorActionPreference = 'Stop'
$raw = [Console]::In.ReadToEnd()
$request = if ($raw.Trim().Length -gt 0) { $raw | ConvertFrom-Json } else { @{} }

Add-Type -AssemblyName System.Drawing
Add-Type -TypeDefinition @"
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

public static class LightcodeComputerUseNative {
  public delegate bool EnumWindowsProc(IntPtr hWnd, IntPtr lParam);

  [StructLayout(LayoutKind.Sequential)]
  public struct RECT {
    public int Left;
    public int Top;
    public int Right;
    public int Bottom;
  }

  [StructLayout(LayoutKind.Sequential)]
  public struct INPUT {
    public int type;
    public InputUnion U;
  }

  [StructLayout(LayoutKind.Explicit)]
  public struct InputUnion {
    [FieldOffset(0)] public MOUSEINPUT mi;
    [FieldOffset(0)] public KEYBDINPUT ki;
  }

  [StructLayout(LayoutKind.Sequential)]
  public struct MOUSEINPUT {
    public int dx;
    public int dy;
    public uint mouseData;
    public uint dwFlags;
    public uint time;
    public IntPtr dwExtraInfo;
  }

  [StructLayout(LayoutKind.Sequential)]
  public struct KEYBDINPUT {
    public ushort wVk;
    public ushort wScan;
    public uint dwFlags;
    public uint time;
    public IntPtr dwExtraInfo;
  }

  [DllImport("user32.dll")] public static extern bool EnumWindows(EnumWindowsProc lpEnumFunc, IntPtr lParam);
  [DllImport("user32.dll")] public static extern bool IsWindow(IntPtr hWnd);
  [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr hWnd);
  [DllImport("user32.dll")] public static extern int GetWindowText(IntPtr hWnd, StringBuilder text, int count);
  [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint processId);
  [DllImport("user32.dll")] public static extern bool GetWindowRect(IntPtr hWnd, out RECT rect);
  [DllImport("user32.dll")] public static extern bool PrintWindow(IntPtr hWnd, IntPtr hdcBlt, uint nFlags);
  [DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr hWnd);
  [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow);
  [DllImport("user32.dll")] public static extern bool SetCursorPos(int x, int y);
  // data is a signed 32-bit value: wheel deltas are negative when scrolling down or left.
  [DllImport("user32.dll")] public static extern void mouse_event(uint flags, uint dx, uint dy, int data, UIntPtr extraInfo);
  [DllImport("user32.dll")] public static extern uint SendInput(uint nInputs, INPUT[] pInputs, int cbSize);
  [DllImport("user32.dll")] public static extern short VkKeyScan(char ch);

  public const uint MOUSEEVENTF_LEFTDOWN = 0x0002;
  public const uint MOUSEEVENTF_LEFTUP = 0x0004;
  public const uint MOUSEEVENTF_RIGHTDOWN = 0x0008;
  public const uint MOUSEEVENTF_RIGHTUP = 0x0010;
  public const uint MOUSEEVENTF_MIDDLEDOWN = 0x0020;
  public const uint MOUSEEVENTF_MIDDLEUP = 0x0040;
  public const uint MOUSEEVENTF_WHEEL = 0x0800;
  public const uint MOUSEEVENTF_HWHEEL = 0x01000;
  public const uint KEYEVENTF_KEYUP = 0x0002;
  public const uint KEYEVENTF_UNICODE = 0x0004;

  public static List<IntPtr> Windows() {
    var windows = new List<IntPtr>();
    EnumWindows((hWnd, lParam) => {
      windows.Add(hWnd);
      return true;
    }, IntPtr.Zero);
    return windows;
  }

  public static void SendUnicodeText(string text) {
    if (String.IsNullOrEmpty(text)) return;
    var inputs = new INPUT[text.Length * 2];
    for (var i = 0; i < text.Length; i++) {
      inputs[i * 2].type = 1;
      inputs[i * 2].U.ki.wScan = text[i];
      inputs[i * 2].U.ki.dwFlags = KEYEVENTF_UNICODE;
      inputs[i * 2 + 1].type = 1;
      inputs[i * 2 + 1].U.ki.wScan = text[i];
      inputs[i * 2 + 1].U.ki.dwFlags = KEYEVENTF_UNICODE | KEYEVENTF_KEYUP;
    }
    SendInput((uint)inputs.Length, inputs, Marshal.SizeOf(typeof(INPUT)));
  }

  public static void Key(ushort vk, bool up) {
    var input = new INPUT[1];
    input[0].type = 1;
    input[0].U.ki.wVk = vk;
    input[0].U.ki.dwFlags = up ? KEYEVENTF_KEYUP : 0;
    SendInput(1, input, Marshal.SizeOf(typeof(INPUT)));
  }
}
"@

function Get-WindowObject([IntPtr]$hWnd) {
  if (-not [LightcodeComputerUseNative]::IsWindow($hWnd)) { return $null }
  if (-not [LightcodeComputerUseNative]::IsWindowVisible($hWnd)) { return $null }
  $titleBuilder = [Text.StringBuilder]::new(512)
  [void][LightcodeComputerUseNative]::GetWindowText($hWnd, $titleBuilder, $titleBuilder.Capacity)
  $title = $titleBuilder.ToString()
  if ($title.Trim().Length -eq 0) { return $null }
  $procId = [uint32]0
  [void][LightcodeComputerUseNative]::GetWindowThreadProcessId($hWnd, [ref]$procId)
  try { $process = Get-Process -Id ([int]$procId) -ErrorAction Stop } catch { $process = $null }
  $rect = New-Object LightcodeComputerUseNative+RECT
  [void][LightcodeComputerUseNative]::GetWindowRect($hWnd, [ref]$rect)
  $width = [Math]::Max(0, $rect.Right - $rect.Left)
  $height = [Math]::Max(0, $rect.Bottom - $rect.Top)
  $app = if ($process -and $process.Path) { $process.Path } elseif ($process) { $process.ProcessName } else { "unknown" }
  $displayName = if ($process) { $process.ProcessName } else { $app }
  [pscustomobject]@{
    app = $app
    displayName = $displayName
    id = [int64]$hWnd
    title = $title
    x = $rect.Left
    y = $rect.Top
    width = $width
    height = $height
  }
}

function Get-WindowList {
  $items = New-Object System.Collections.Generic.List[object]
  foreach ($hWnd in [LightcodeComputerUseNative]::Windows()) {
    $window = Get-WindowObject $hWnd
    if ($null -ne $window -and $window.width -gt 0 -and $window.height -gt 0) {
      $items.Add($window)
    }
  }
  $items
}

function Require-Window($req) {
  $hWnd = [IntPtr]([int64]$req.id)
  $window = Get-WindowObject $hWnd
  if ($null -eq $window) { throw "Window is no longer available." }
  if ($req.app -and $window.app -ne [string]$req.app) {
    throw "Window app no longer matches the requested app."
  }
  $window
}

function Activate-Window($window) {
  $hWnd = [IntPtr]([int64]$window.id)
  [void][LightcodeComputerUseNative]::ShowWindow($hWnd, 9)
  Start-Sleep -Milliseconds 40
  if (-not [LightcodeComputerUseNative]::SetForegroundWindow($hWnd)) {
    throw "Unable to activate window. The desktop may be locked or another secure surface may be active."
  }
  Start-Sleep -Milliseconds 40
}

function Capture-Window($window) {
  $hWnd = [IntPtr]([int64]$window.id)
  $width = [Math]::Max(1, [int]$window.width)
  $height = [Math]::Max(1, [int]$window.height)
  $bitmap = New-Object Drawing.Bitmap($width, $height)
  $graphics = [Drawing.Graphics]::FromImage($bitmap)
  $usedFallback = $false
  try {
    $hdc = $graphics.GetHdc()
    try {
      $ok = [LightcodeComputerUseNative]::PrintWindow($hWnd, $hdc, 2)
    } finally {
      $graphics.ReleaseHdc($hdc)
    }
    if (-not $ok) {
      $usedFallback = $true
      $graphics.CopyFromScreen([int]$window.x, [int]$window.y, 0, 0, [Drawing.Size]::new($width, $height))
    }
    $stream = New-Object IO.MemoryStream
    try {
      $bitmap.Save($stream, [Drawing.Imaging.ImageFormat]::Png)
      [pscustomobject]@{
        id = "window"
        mimeType = "image/png"
        data = [Convert]::ToBase64String($stream.ToArray())
        width = $width
        height = $height
        originX = [int]$window.x
        originY = [int]$window.y
        zIndex = 0
        fallback = $usedFallback
      }
    } finally {
      $stream.Dispose()
    }
  } finally {
    $graphics.Dispose()
    $bitmap.Dispose()
  }
}

function Resolve-Key($token) {
  $t = ([string]$token).Trim().ToLowerInvariant()
  switch ($t) {
    "control" { return 0x11 }
    "ctrl" { return 0x11 }
    "control_l" { return 0x11 }
    "control_r" { return 0x11 }
    "shift" { return 0x10 }
    "shift_l" { return 0x10 }
    "shift_r" { return 0x10 }
    "alt" { return 0x12 }
    "alt_l" { return 0x12 }
    "alt_r" { return 0x12 }
    "return" { return 0x0D }
    "enter" { return 0x0D }
    "tab" { return 0x09 }
    "escape" { return 0x1B }
    "esc" { return 0x1B }
    "space" { return 0x20 }
    "backspace" { return 0x08 }
    "delete" { return 0x2E }
    "left" { return 0x25 }
    "arrowleft" { return 0x25 }
    "up" { return 0x26 }
    "arrowup" { return 0x26 }
    "right" { return 0x27 }
    "arrowright" { return 0x27 }
    "down" { return 0x28 }
    "arrowdown" { return 0x28 }
    "home" { return 0x24 }
    "end" { return 0x23 }
    "page_up" { return 0x21 }
    "pageup" { return 0x21 }
    "page_down" { return 0x22 }
    "pagedown" { return 0x22 }
    "period" { return 0xBE }
    "comma" { return 0xBC }
    "slash" { return 0xBF }
    "minus" { return 0xBD }
    "plus" { return 0xBB }
    "equal" { return 0xBB }
  }
  if ($t -match '^f([1-9]|1[0-9]|2[0-4])$') { return 0x70 + [int]$Matches[1] - 1 }
  if ($t -match '^kp_([0-9])$' -or $t -match '^numpad_([0-9])$') { return 0x60 + [int]$Matches[1] }
  if ($t.Length -eq 1) {
    $vk = [LightcodeComputerUseNative]::VkKeyScan([char]$t[0])
    if ($vk -eq -1) { throw "Unsupported key: $token" }
    return ($vk -band 0xff)
  }
  throw "Unsupported key: $token"
}

function Press-Chord($key) {
  $tokens = ([string]$key) -split '\+' | ForEach-Object { $_.Trim() } | Where-Object { $_.Length -gt 0 }
  if ($tokens.Count -eq 0) { throw "key is required" }
  $vks = @($tokens | ForEach-Object { [uint16](Resolve-Key $_) })
  foreach ($vk in $vks) { [LightcodeComputerUseNative]::Key($vk, $false) }
  [Array]::Reverse($vks)
  foreach ($vk in $vks) { [LightcodeComputerUseNative]::Key($vk, $true) }
}

function Mouse-Click($button, $count) {
  $down = [LightcodeComputerUseNative]::MOUSEEVENTF_LEFTDOWN
  $up = [LightcodeComputerUseNative]::MOUSEEVENTF_LEFTUP
  $b = ([string]$button).ToLowerInvariant()
  if ($b -eq "right" -or $b -eq "r") {
    $down = [LightcodeComputerUseNative]::MOUSEEVENTF_RIGHTDOWN
    $up = [LightcodeComputerUseNative]::MOUSEEVENTF_RIGHTUP
  } elseif ($b -eq "middle" -or $b -eq "m") {
    $down = [LightcodeComputerUseNative]::MOUSEEVENTF_MIDDLEDOWN
    $up = [LightcodeComputerUseNative]::MOUSEEVENTF_MIDDLEUP
  }
  for ($i = 0; $i -lt [Math]::Max(1, [int]$count); $i++) {
    [LightcodeComputerUseNative]::mouse_event($down, 0, 0, 0, [UIntPtr]::Zero)
    Start-Sleep -Milliseconds 20
    [LightcodeComputerUseNative]::mouse_event($up, 0, 0, 0, [UIntPtr]::Zero)
  }
}

switch ([string]$request.action) {
  "list_windows" {
    $result = @(Get-WindowList | ForEach-Object {
      [pscustomobject]@{ app = $_.app; id = $_.id; title = $_.title; x = $_.x; y = $_.y; width = $_.width; height = $_.height }
    })
  }
  "list_apps" {
    $groups = Get-WindowList | Group-Object app
    $result = @($groups | ForEach-Object {
      $first = $_.Group[0]
      [pscustomobject]@{
        id = $_.Name
        displayName = $first.displayName
        isRunning = $true
        windows = @($_.Group | ForEach-Object {
          [pscustomobject]@{ app = $_.app; id = $_.id; title = $_.title; x = $_.x; y = $_.y; width = $_.width; height = $_.height }
        })
      }
    })
  }
  "get_window" {
    $window = Require-Window $request.input
    $result = [pscustomobject]@{ app = $window.app; id = $window.id; title = $window.title; x = $window.x; y = $window.y; width = $window.width; height = $window.height }
  }
  "get_window_state" {
    $window = Require-Window $request.input.window
    $screenshots = @()
    $notes = @("Window listing and screenshots are passive. Input actions switch to interactive mode and activate the target window.")
    if ($request.input.include_screenshot -ne $false) {
      $capture = Capture-Window $window
      if ($capture.fallback) { $notes += "Passive PrintWindow capture was unavailable; used visible screen-region capture." }
      $screenshots = @([pscustomobject]@{
        id = $capture.id
        mimeType = $capture.mimeType
        data = $capture.data
        width = $capture.width
        height = $capture.height
        originX = $capture.originX
        originY = $capture.originY
        zIndex = $capture.zIndex
      })
    }
    $accessibility = $null
    if ($request.input.include_text -eq $true) {
      $accessibility = [pscustomobject]@{
        tree = 'Window: "' + $window.title + '", App: ' + $window.app
      }
      $notes += "Detailed UI Automation text is not available in this Lightcode helper yet."
    }
    $result = [pscustomobject]@{
      window = [pscustomobject]@{ app = $window.app; id = $window.id; title = $window.title; x = $window.x; y = $window.y; width = $window.width; height = $window.height }
      accessibility = $accessibility
      screenshots = $screenshots
      mode = "passive"
      notes = $notes
    }
  }
  "activate_window" {
    $window = Require-Window $request.input.window
    Activate-Window $window
    $result = [pscustomobject]@{ ok = $true; mode = "interactive" }
  }
  "click" {
    $window = Require-Window $request.input.window
    Activate-Window $window
    $x = [int]$request.input.x
    $y = [int]$request.input.y
    [void][LightcodeComputerUseNative]::SetCursorPos([int]$window.x + $x, [int]$window.y + $y)
    Mouse-Click $request.input.mouse_button $request.input.click_count
    $result = [pscustomobject]@{ ok = $true; mode = "interactive" }
  }
  "type_text" {
    $window = Require-Window $request.input.window
    Activate-Window $window
    [LightcodeComputerUseNative]::SendUnicodeText([string]$request.input.text)
    $result = [pscustomobject]@{ ok = $true; mode = "interactive" }
  }
  "press_key" {
    $window = Require-Window $request.input.window
    Activate-Window $window
    Press-Chord $request.input.key
    $result = [pscustomobject]@{ ok = $true; mode = "interactive" }
  }
  "scroll" {
    $window = Require-Window $request.input.window
    Activate-Window $window
    [void][LightcodeComputerUseNative]::SetCursorPos([int]$window.x + [int]$request.input.x, [int]$window.y + [int]$request.input.y)
    # Wheel deltas are signed; a positive scrollY (scroll down) becomes a negative delta.
    if ([int]$request.input.scrollY -ne 0) {
      [LightcodeComputerUseNative]::mouse_event([LightcodeComputerUseNative]::MOUSEEVENTF_WHEEL, 0, 0, [int](-1 * [int]$request.input.scrollY), [UIntPtr]::Zero)
    }
    if ([int]$request.input.scrollX -ne 0) {
      [LightcodeComputerUseNative]::mouse_event([LightcodeComputerUseNative]::MOUSEEVENTF_HWHEEL, 0, 0, [int]$request.input.scrollX, [UIntPtr]::Zero)
    }
    $result = [pscustomobject]@{ ok = $true; mode = "interactive" }
  }
  "drag" {
    $window = Require-Window $request.input.window
    Activate-Window $window
    [void][LightcodeComputerUseNative]::SetCursorPos([int]$window.x + [int]$request.input.from_x, [int]$window.y + [int]$request.input.from_y)
    [LightcodeComputerUseNative]::mouse_event([LightcodeComputerUseNative]::MOUSEEVENTF_LEFTDOWN, 0, 0, 0, [UIntPtr]::Zero)
    Start-Sleep -Milliseconds 40
    [void][LightcodeComputerUseNative]::SetCursorPos([int]$window.x + [int]$request.input.to_x, [int]$window.y + [int]$request.input.to_y)
    Start-Sleep -Milliseconds 40
    [LightcodeComputerUseNative]::mouse_event([LightcodeComputerUseNative]::MOUSEEVENTF_LEFTUP, 0, 0, 0, [UIntPtr]::Zero)
    $result = [pscustomobject]@{ ok = $true; mode = "interactive" }
  }
  "launch_app" {
    Start-Process -FilePath ([string]$request.input.app)
    $result = [pscustomobject]@{ ok = $true }
  }
  "set_value" {
    throw "set_value is not supported yet; click or focus the target field, then use type_text."
  }
  "perform_secondary_action" {
    throw "perform_secondary_action is not supported yet; use keyboard navigation or coordinate input."
  }
  default {
    throw "Unknown action: $($request.action)"
  }
}

$result | ConvertTo-Json -Depth 32 -Compress
`;
// The helper is too large to pass via `-EncodedCommand` (base64 of the
// UTF-16LE script exceeds the ~32k Windows command-line limit and spawn fails
// with ENAMETOOLONG). Stage it to a temp `.ps1` once per process and run it
// with `-File`, leaving stdin free for the JSON request payload.
let cachedHelperPath: string | null = null;

/**
 * Writes the PowerShell helper to `<tmpdir>/lightcode-computer-use/windows-helper.ps1`
 * on first use and returns that path; later calls return the cached path
 * without touching the file system again.
 *
 * @returns Absolute path of the staged helper script.
 */
function ensureWindowsHelperScript(): string {
  if (cachedHelperPath) return cachedHelperPath;
  const dir = join(tmpdir(), "lightcode-computer-use");
  mkdirSync(dir, { recursive: true });
  const path = join(dir, "windows-helper.ps1");
  writeFileSync(path, WINDOWS_HELPER, "utf8");
  cachedHelperPath = path;
  return path;
}

/**
 * Runs one helper action in Windows PowerShell and returns its parsed JSON result.
 *
 * The request is sent on stdin as `{ action, input }`; stdout is parsed as JSON.
 *
 * @typeParam T Expected shape of the helper's JSON output (not validated).
 * @param action Helper action name, e.g. `"list_windows"`.
 * @param input Action payload; defaults to an empty object.
 * @returns The parsed result, or `null` when the helper printed nothing.
 * @throws SyntaxError when stdout is non-empty but not valid JSON; whatever
 *   `runProcess` rejects with is propagated unchanged.
 */
async function runWindowsComputerUse<T>(action: string, input?: unknown): Promise<T> {
  const scriptPath = ensureWindowsHelperScript();
  // Honour a non-default Windows directory; falls back to the conventional location.
  const systemRoot = process.env.SystemRoot ?? "C:\\Windows";
  const { stdout } = await runProcess(
    `${systemRoot}\\System32\\WindowsPowerShell\\v1.0\\powershell.exe`,
    ["-NoLogo", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-File", scriptPath],
    {
      input: JSON.stringify({ action, input: input ?? {} }),
      timeoutMs: 20_000,
      maxBufferBytes: 24 * 1024 * 1024,
    },
  );
  // Piping an empty array into ConvertTo-Json emits nothing at all, so an empty
  // stdout means "no results", not a malformed response. Report it as null
  // (which normalizeArray turns into []) instead of letting JSON.parse("") throw.
  const payload = stdout.trim();
  return (payload.length > 0 ? JSON.parse(payload) : null) as T;
}

/**
 * Coerces a helper result into an array.
 *
 * PowerShell pipelines unwrap single-element arrays, so a list action may
 * produce an array, one bare object, or nothing.
 *
 * @param value Array, single item, `null` or `undefined`.
 * @returns The array itself, a one-element array, or `[]` for null/undefined.
 */
function normalizeArray<T>(value: T | T[] | null | undefined): T[] {
  if (Array.isArray(value)) return value;
  return value == null ? [] : [value];
}
ComputerUseWindow; + }): Promise<{ ok: true; mode: "interactive" }> { + return runWindowsComputerUse("set_value", input); + } + + performSecondaryAction(input: { + action: string; + element_index: number; + window: ComputerUseWindow; + }): Promise<{ ok: true; mode: "interactive" }> { + return runWindowsComputerUse("perform_secondary_action", input); + } +} diff --git a/src/main/computer-use/index.ts b/src/main/computer-use/index.ts new file mode 100644 index 00000000..1aed8dfc --- /dev/null +++ b/src/main/computer-use/index.ts @@ -0,0 +1 @@ +export { ComputerUseMcpIngress, type ComputerUseMcpIngressInfo } from "./ComputerUseMcpIngress"; diff --git a/src/main/computer-use/mcp/toolRegistry.ts b/src/main/computer-use/mcp/toolRegistry.ts new file mode 100644 index 00000000..a1c2d15e --- /dev/null +++ b/src/main/computer-use/mcp/toolRegistry.ts @@ -0,0 +1,336 @@ +import type { ComputerUseDriver, ComputerUseScreenshot, ComputerUseWindowState } from "./types"; +import { readNumber, readRecord, readString, readWindow } from "../drivers/common"; + +export interface ToolContext { + driver: ComputerUseDriver; +} + +export interface ToolSpec { + name: string; + description: string; + inputSchema: Record; +} + +const WINDOW_SCHEMA = { + type: "object", + required: ["app", "id"], + properties: { + app: { type: "string" }, + id: { type: "number" }, + title: { type: "string" }, + x: { type: "number" }, + y: { type: "number" }, + width: { type: "number" }, + height: { type: "number" }, + }, +}; + +export const COMPUTER_USE_MCP_INSTRUCTIONS = + "Use the computer_use MCP server to inspect and control native macOS or Windows apps. Start with computer_use.api or computer_use.list_apps, choose a returned window, then call computer_use.get_window_state before coordinate input. 
list/get/screenshot operations are passive where the OS allows it; click, drag, scroll, type_text, press_key, set_value, perform_secondary_action, activate_window, and launch_app switch to interactive mode and may activate the target app. Prefer the browser MCP server for web pages. Locked desktops, secure prompts, OS permission prompts, and password/authentication surfaces require the user."; + +export const TOOLS: ToolSpec[] = [ + { + name: "api", + description: + "Return the complete Computer Use API and guidance. Call first when controlling a native app.", + inputSchema: { type: "object", properties: {} }, + }, + { + name: "list_apps", + description: "List discoverable apps and their currently targetable windows.", + inputSchema: { type: "object", properties: {} }, + }, + { + name: "list_windows", + description: "List currently targetable windows.", + inputSchema: { type: "object", properties: {} }, + }, + { + name: "launch_app", + description: "Launch an app by a list_apps id, app name, or explicit executable/app path.", + inputSchema: { + type: "object", + required: ["app"], + properties: { app: { type: "string" } }, + }, + }, + { + name: "get_window", + description: + "Refresh a window object returned by list_apps, list_windows, or get_window_state.", + inputSchema: { + type: "object", + required: ["id"], + properties: { app: { type: "string" }, id: { type: "number" } }, + }, + }, + { + name: "get_window_state", + description: + "Capture a point-in-time passive window screenshot and optional accessibility text.", + inputSchema: { + type: "object", + required: ["window"], + properties: { + window: WINDOW_SCHEMA, + include_screenshot: { type: "boolean" }, + include_text: { type: "boolean" }, + }, + }, + }, + { + name: "activate_window", + description: "Bring a returned window to the foreground. 
This is interactive.", + inputSchema: { + type: "object", + required: ["window"], + properties: { window: WINDOW_SCHEMA }, + }, + }, + { + name: "click", + description: + "Click window-relative coordinates from the latest screenshot. This is interactive.", + inputSchema: { + type: "object", + required: ["window", "x", "y"], + properties: { + window: WINDOW_SCHEMA, + x: { type: "number" }, + y: { type: "number" }, + click_count: { type: "number" }, + mouse_button: { type: "string", enum: ["left", "right", "middle", "l", "r", "m"] }, + }, + }, + }, + { + name: "press_key", + description: + "Press a key or + separated chord such as Return, Escape, Control_L+a, or KP_0. This is interactive.", + inputSchema: { + type: "object", + required: ["window", "key"], + properties: { window: WINDOW_SCHEMA, key: { type: "string" } }, + }, + }, + { + name: "type_text", + description: + "Type literal text into the focused control in a returned window. This is interactive.", + inputSchema: { + type: "object", + required: ["window", "text"], + properties: { window: WINDOW_SCHEMA, text: { type: "string" } }, + }, + }, + { + name: "scroll", + description: "Scroll from window-relative coordinates. This is interactive.", + inputSchema: { + type: "object", + required: ["window", "x", "y", "scrollX", "scrollY"], + properties: { + window: WINDOW_SCHEMA, + x: { type: "number" }, + y: { type: "number" }, + scrollX: { type: "number" }, + scrollY: { type: "number" }, + }, + }, + }, + { + name: "drag", + description: "Drag between window-relative coordinates. This is interactive.", + inputSchema: { + type: "object", + required: ["window", "from_x", "from_y", "to_x", "to_y"], + properties: { + window: WINDOW_SCHEMA, + from_x: { type: "number" }, + from_y: { type: "number" }, + to_x: { type: "number" }, + to_y: { type: "number" }, + }, + }, + }, + { + name: "set_value", + description: + "Set the value of an editable element by accessibility element_index when supported. 
/** Canonical tool names, derived from the `TOOLS` table. */
export const TOOL_NAMES = new Set<string>(TOOLS.map(({ name }) => name));

/** Short alias -> canonical tool name. */
const TOOL_ALIASES = new Map<string, string>([
  ["apps", "list_apps"],
  ["windows", "list_windows"],
  ["screenshot", "get_window_state"],
  ["key", "press_key"],
  ["type", "type_text"],
]);

/**
 * Maps an alias to its canonical tool name.
 *
 * @param name Tool name as supplied by the caller.
 * @returns The canonical name for a known alias, otherwise `name` unchanged.
 */
export function normalizeToolName(name: string): string {
  const canonical = TOOL_ALIASES.get(name);
  return canonical ?? name;
}

/**
 * Reports whether `name`, after alias resolution, is a registered tool.
 *
 * @param name Tool name or alias.
 */
export function isKnownToolName(name: string): boolean {
  const canonical = normalizeToolName(name);
  return TOOL_NAMES.has(canonical);
}
{ app: args.app } : {}), + id: readNumber(args.id, "id"), + }); + case "get_window_state": { + const payload = readRecord(args); + return await ctx.driver.getWindowState({ + window: readWindow(payload.window), + ...(typeof payload.include_screenshot === "boolean" + ? { include_screenshot: payload.include_screenshot } + : {}), + ...(typeof payload.include_text === "boolean" + ? { include_text: payload.include_text } + : {}), + }); + } + case "activate_window": + return await ctx.driver.activateWindow({ window: readWindow(args.window) }); + case "click": + return await ctx.driver.click({ + window: readWindow(args.window), + x: readNumber(args.x, "x"), + y: readNumber(args.y, "y"), + ...(typeof args.click_count === "number" ? { click_count: args.click_count } : {}), + ...(typeof args.mouse_button === "string" ? { mouse_button: args.mouse_button } : {}), + }); + case "press_key": + return await ctx.driver.pressKey({ + window: readWindow(args.window), + key: readString(args.key, "key"), + }); + case "type_text": + return await ctx.driver.typeText({ + window: readWindow(args.window), + text: readString(args.text, "text"), + }); + case "scroll": + return await ctx.driver.scroll({ + window: readWindow(args.window), + x: readNumber(args.x, "x"), + y: readNumber(args.y, "y"), + scrollX: readNumber(args.scrollX, "scrollX"), + scrollY: readNumber(args.scrollY, "scrollY"), + }); + case "drag": + return await ctx.driver.drag({ + window: readWindow(args.window), + from_x: readNumber(args.from_x, "from_x"), + from_y: readNumber(args.from_y, "from_y"), + to_x: readNumber(args.to_x, "to_x"), + to_y: readNumber(args.to_y, "to_y"), + }); + case "set_value": + return await ctx.driver.setValue({ + window: readWindow(args.window), + element_index: readNumber(args.element_index, "element_index"), + value: readString(args.value, "value"), + }); + case "perform_secondary_action": + return await ctx.driver.performSecondaryAction({ + window: readWindow(args.window), + element_index: 
/** One MCP content part: either text or a base64 image. */
export interface McpContent {
  type: "text" | "image";
  text?: string;
  data?: string;
  mimeType?: string;
}

/** Result envelope returned for a tool call. */
export interface McpToolResult {
  content: McpContent[];
  isError?: boolean;
}

/** Returns a copy of `screenshot` without its `data` payload. */
function screenshotMetadata(
  screenshot: ComputerUseScreenshot,
): Omit<ComputerUseScreenshot, "data"> {
  const { data: _data, ...metadata } = screenshot;
  return metadata;
}

/**
 * Converts a raw tool result into MCP content.
 *
 * For `get_window_state` (or an alias of it) the state is emitted as JSON text
 * with the screenshot payloads stripped, followed by one image part per
 * screenshot. Every other result is emitted as a single pretty-printed JSON
 * text part.
 *
 * A state object without a `screenshots` array is treated as having none, and
 * an array result is never interpreted as a window state, so an unexpected
 * driver result degrades to plain JSON instead of throwing.
 *
 * @param name Tool name or alias that produced `result`.
 * @param result Value returned by the tool dispatcher.
 */
export function formatToolResult(name: string, result: unknown): McpToolResult {
  const isWindowState =
    normalizeToolName(name) === "get_window_state" &&
    result !== null &&
    typeof result === "object" &&
    !Array.isArray(result);

  if (isWindowState) {
    const state = result as ComputerUseWindowState;
    const screenshots = Array.isArray(state.screenshots) ? state.screenshots : [];
    const metadata = {
      ...state,
      screenshots: screenshots.map(screenshotMetadata),
    };
    return {
      content: [
        { type: "text", text: JSON.stringify(metadata, null, 2) },
        ...screenshots.map((screenshot) => ({
          type: "image" as const,
          data: screenshot.data,
          mimeType: screenshot.mimeType,
        })),
      ],
    };
  }

  return {
    content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
  };
}
/** Point-in-time description of one window as returned by `getWindowState`. */
export interface ComputerUseWindowState {
  accessibility: ComputerUseAccessibilityState | null;
  mode: "passive" | "interactive";
  notes?: string[];
  screenshots: ComputerUseScreenshot[];
  window: ComputerUseWindow;
}

/**
 * Platform driver contract used by the computer-use tool dispatcher.
 *
 * Every method is asynchronous. Input actions resolve to
 * `{ ok: true, mode: "interactive" }`; lookup methods resolve to the
 * window/app/state objects declared in this module.
 */
export interface ComputerUseDriver {
  activateWindow(input: { window: ComputerUseWindow }): Promise<{ ok: true; mode: "interactive" }>;
  click(input: {
    click_count?: number;
    mouse_button?: string;
    window: ComputerUseWindow;
    x?: number;
    y?: number;
  }): Promise<{ ok: true; mode: "interactive" }>;
  drag(input: {
    from_x: number;
    from_y: number;
    to_x: number;
    to_y: number;
    window: ComputerUseWindow;
  }): Promise<{ ok: true; mode: "interactive" }>;
  getWindow(input: { app?: string; id: number }): Promise<ComputerUseWindow>;
  getWindowState(input: {
    include_screenshot?: boolean;
    include_text?: boolean;
    window: ComputerUseWindow;
  }): Promise<ComputerUseWindowState>;
  launchApp(input: { app: string }): Promise<{ ok: true }>;
  listApps(): Promise<ComputerUseApp[]>;
  listWindows(): Promise<ComputerUseWindow[]>;
  performSecondaryAction(input: {
    action: string;
    element_index: number;
    window: ComputerUseWindow;
  }): Promise<{ ok: true; mode: "interactive" }>;
  pressKey(input: { key: string; window: ComputerUseWindow }): Promise<{
    ok: true;
    mode: "interactive";
  }>;
  scroll(input: {
    scrollX: number;
    scrollY: number;
    window: ComputerUseWindow;
    x: number;
    y: number;
  }): Promise<{ ok: true; mode: "interactive" }>;
  setValue(input: {
    element_index: number;
    value: string;
    window: ComputerUseWindow;
  }): Promise<{ ok: true; mode: "interactive" }>;
  typeText(input: { text: string; window: ComputerUseWindow }): Promise<{
    ok: true;
    mode: "interactive";
  }>;
}
"./browser"; +import { ComputerUseMcpIngress } from "./computer-use"; import { SupervisorClient } from "./supervisor/SupervisorClient"; import { createAutoUpdaterController } from "./updates/autoUpdater"; import { createMainWindow } from "./window/createMainWindow"; @@ -75,6 +76,7 @@ let lightcodePaths: LightcodePaths | null = null; let windowsJobObjectManager: WindowsJobObjectManager | null = null; let browserPanelManager: BrowserPanelManager | null = null; let browserMcpIngress: BrowserMcpIngress | null = null; +let computerUseMcpIngress: ComputerUseMcpIngress | null = null; // Retained module-scope so the native Tray icon stays reachable from GC. let tray: TrayHandle | null = null; let isQuitting = false; @@ -221,11 +223,21 @@ if (!hasSingleInstanceLock) { wslHelpersDir, secretStorageKey, resolveExtraEnv: () => { - const info = browserMcpIngress?.getInfo(); - if (!info) return {}; + const browserInfo = browserMcpIngress?.getInfo(); + const computerUseInfo = computerUseMcpIngress?.getInfo(); return { - LIGHTCODE_BROWSER_MCP_URL: info.url, - LIGHTCODE_BROWSER_MCP_TOKEN: info.token, + ...(browserInfo + ? { + LIGHTCODE_BROWSER_MCP_URL: browserInfo.url, + LIGHTCODE_BROWSER_MCP_TOKEN: browserInfo.token, + } + : {}), + ...(computerUseInfo + ? 
/**
 * Configuration for a `StreamableHttpMcpIngress`.
 *
 * @typeParam TContext Per-call context object produced by `buildContext` and
 *   passed to `dispatchTool` / `onBeforeToolCall`.
 */
export interface StreamableHttpMcpIngressOptions<TContext> {
  /** NOTE(review): presumably the error text used when `buildContext` returns null; confirm in the class. */
  contextUnavailableMessage?: string;
  /** Executes the named tool with its arguments and the current context. */
  dispatchTool(name: string, args: Record<string, unknown>, ctx: TContext): Promise<unknown>;
  /** Converts a raw tool result into MCP content parts. */
  formatToolResult(name: string, result: unknown): StreamableHttpMcpToolResult;
  /** Builds the context for a call, or returns null when none is available. */
  buildContext(): TContext | null;
  instructions: string;
  isKnownToolName(name: string): boolean;
  /** Optional hook; by its name it runs before a tool is dispatched (TODO confirm in the class). */
  onBeforeToolCall?(name: string, ctx: TContext): void;
  serverInfo: { name: string; version: string };
  tools: readonly StreamableHttpMcpToolSpec[];
}
addr.port : 0; + this.server = server; + this.info = { url: `http://127.0.0.1:${port}`, token: this.token, port }; + resolve(this.info); + }); + }); + } + + getInfo(): StreamableHttpMcpIngressInfo | null { + return this.info; + } + + dispose(): void { + try { + this.server?.closeAllConnections?.(); + } catch {} + try { + this.server?.close(); + } catch {} + this.server = null; + } + + private async readBody(req: IncomingMessage): Promise { + return await new Promise((resolve, reject) => { + let total = 0; + const chunks: Buffer[] = []; + req.on("data", (chunk: Buffer) => { + total += chunk.length; + if (total > MAX_BODY) { + req.destroy(); + reject(new Error("body too large")); + return; + } + chunks.push(chunk); + }); + req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8"))); + req.on("error", reject); + }); + } + + private sendJson(res: ServerResponse, status: number, body: unknown): void { + res.statusCode = status; + res.setHeader("Content-Type", "application/json"); + res.setHeader("Cache-Control", "no-store"); + res.end(JSON.stringify(body)); + } + + private checkAuth(req: IncomingMessage): boolean { + const auth = req.headers.authorization; + if (auth && auth.startsWith("Bearer ") && auth.slice(7).trim() === this.token) { + return true; + } + const xToken = req.headers["x-lightcode-token"]; + return typeof xToken === "string" && xToken === this.token; + } + + private async handle(req: IncomingMessage, res: ServerResponse): Promise { + try { + if (!req.url) { + this.sendJson(res, 404, { error: "not found" }); + return; + } + const path = new URL(req.url, "http://x").pathname; + + if (req.method === "OPTIONS") { + res.statusCode = 204; + res.setHeader( + "Access-Control-Allow-Headers", + "Authorization, X-Lightcode-Token, Content-Type, Mcp-Session-Id", + ); + res.setHeader("Access-Control-Allow-Methods", "POST, GET, OPTIONS"); + res.setHeader("Access-Control-Allow-Origin", "*"); + res.end(); + return; + } + + if (!this.checkAuth(req)) { + 
this.sendJson(res, 401, { error: "unauthorized" }); + return; + } + + if (path === "/mcp" || path === "/mcp/") { + if (req.method === "GET") { + res.statusCode = 405; + res.setHeader("Allow", "POST"); + res.end(); + return; + } + if (req.method !== "POST") { + this.sendJson(res, 405, { error: "method not allowed" }); + return; + } + await this.handleMcp(req, res); + return; + } + + this.sendJson(res, 404, { error: "not found" }); + } catch (err) { + this.sendJson(res, 500, { error: (err as Error).message ?? "internal" }); + } + } + + private async handleMcp(req: IncomingMessage, res: ServerResponse): Promise { + const raw = await this.readBody(req); + let body: unknown; + try { + body = raw ? JSON.parse(raw) : {}; + } catch { + this.sendJson(res, 400, { + jsonrpc: "2.0", + id: null, + error: { code: -32700, message: "Parse error" }, + }); + return; + } + + let sessionId = req.headers["mcp-session-id"]; + if (Array.isArray(sessionId)) sessionId = sessionId[0]; + if (typeof sessionId !== "string" || !sessionId) { + sessionId = randomUUID(); + } + res.setHeader("Mcp-Session-Id", sessionId); + + if (Array.isArray(body)) { + const out: JsonRpcResponse[] = []; + for (const message of body) { + const reply = await this.handleSingle(message); + if (reply) out.push(reply); + } + this.sendJson(res, 200, out); + return; + } + const reply = await this.handleSingle(body); + if (!reply) { + res.statusCode = 202; + res.end(); + return; + } + this.sendJson(res, 200, reply); + } + + private async handleSingle(message: unknown): Promise { + if (!isJsonRpcRequest(message)) return null; + const { id = null, method, params } = message; + try { + if (method === "initialize") { + return { + jsonrpc: "2.0", + id, + result: { + protocolVersion: MCP_PROTOCOL_VERSION, + capabilities: { tools: {} }, + serverInfo: this.options.serverInfo, + instructions: this.options.instructions, + }, + }; + } + if (method === "notifications/initialized" || method === "initialized") { + return null; + } + if 
(method === "ping") { + return { jsonrpc: "2.0", id, result: {} }; + } + if (method === "tools/list") { + return { jsonrpc: "2.0", id, result: { tools: this.options.tools } }; + } + if (method === "tools/call") { + const p = (params ?? {}) as { name?: string; arguments?: Record }; + const name = String(p.name ?? ""); + const args = (p.arguments ?? {}) as Record; + if (!this.options.isKnownToolName(name)) { + return { + jsonrpc: "2.0", + id, + result: { + isError: true, + content: [{ type: "text", text: `Unknown tool: ${name}` }], + }, + }; + } + const ctx = this.options.buildContext(); + if (!ctx) { + return { + jsonrpc: "2.0", + id, + result: { + isError: true, + content: [ + { type: "text", text: this.options.contextUnavailableMessage ?? "not ready" }, + ], + }, + }; + } + this.options.onBeforeToolCall?.(name, ctx); + let raw: unknown; + try { + raw = await this.options.dispatchTool(name, args, ctx); + } catch (err) { + return { + jsonrpc: "2.0", + id, + result: { + isError: true, + content: [{ type: "text", text: (err as Error).message ?? String(err) }], + }, + }; + } + const result = this.options.formatToolResult(name, raw); + return { jsonrpc: "2.0", id, result }; + } + return { + jsonrpc: "2.0", + id, + error: { code: -32601, message: `Method not found: ${method}` }, + }; + } catch (err) { + return { + jsonrpc: "2.0", + id, + error: { code: -32000, message: (err as Error).message ?? 
"internal" }, + }; + } + } +} + +function isJsonRpcRequest(value: unknown): value is JsonRpcRequest { + return ( + typeof value === "object" && + value !== null && + (value as { jsonrpc?: unknown }).jsonrpc === "2.0" && + typeof (value as { method?: unknown }).method === "string" + ); +} diff --git a/src/renderer/components/composer/AttachmentBar.tsx b/src/renderer/components/composer/AttachmentBar.tsx index 96f9a2a3..545fd401 100644 --- a/src/renderer/components/composer/AttachmentBar.tsx +++ b/src/renderer/components/composer/AttachmentBar.tsx @@ -1,6 +1,6 @@ import type { ReactNode } from "react"; import { Tooltip } from "@heroui/react"; -import { Globe, X } from "lucide-react"; +import { Globe, Monitor, X } from "lucide-react"; import { getEntryIconUrl } from "@/renderer/components/common/fileIcons"; import { toLocalFileUrl } from "@/shared/promptContent"; import type { Attachment } from "./useAttachments"; @@ -60,6 +60,56 @@ export function BrowserChip(props: { ); } +export function ComputerUseChip(props: { + onRemove?: (() => void) | undefined; + title?: string; + variant?: "chip" | "header"; +}) { + const { onRemove, title = "Computer Use enabled for this thread", variant = "chip" } = props; + if (variant === "header") { + return ( + + + + + {title} + + ); + } + return ( +
+
+ ); +} + function AttachmentChip(props: { attachment: Attachment; onRemove?: ((id: string) => void) | undefined; diff --git a/src/renderer/components/composer/ComposerAddMenu.tsx b/src/renderer/components/composer/ComposerAddMenu.tsx index 148466dc..c6a5a365 100644 --- a/src/renderer/components/composer/ComposerAddMenu.tsx +++ b/src/renderer/components/composer/ComposerAddMenu.tsx @@ -1,15 +1,26 @@ import { useState } from "react"; -import { Globe, Paperclip, Plus } from "lucide-react"; +import { Globe, Monitor, Paperclip, Plus } from "lucide-react"; import { Label, ListBox, Popover, Tooltip } from "@heroui/react"; import { Button } from "@/renderer/components/common"; export function ComposerAddMenu(props: { browserMcpEnabled: boolean; + computerUseEnabled: boolean; showBrowserOption: boolean; + showComputerUseOption: boolean; onPickFiles: () => void; onToggleBrowserMcp: (next: boolean) => void; + onToggleComputerUse: (next: boolean) => void; }) { - const { browserMcpEnabled, showBrowserOption, onPickFiles, onToggleBrowserMcp } = props; + const { + browserMcpEnabled, + computerUseEnabled, + showBrowserOption, + showComputerUseOption, + onPickFiles, + onToggleBrowserMcp, + onToggleComputerUse, + } = props; const [isOpen, setIsOpen] = useState(false); const handleSelect = (id: string) => { @@ -20,6 +31,10 @@ export function ComposerAddMenu(props: { } if (id === "browser") { onToggleBrowserMcp(!browserMcpEnabled); + return; + } + if (id === "computer-use") { + onToggleComputerUse(!computerUseEnabled); } }; @@ -70,6 +85,19 @@ export function ComposerAddMenu(props: { ) : null} + {showComputerUseOption ? ( + + + + + {computerUseEnabled ? 
"Disable" : "Enable"} + + + ) : null} diff --git a/src/renderer/components/composer/MentionInput.test.ts b/src/renderer/components/composer/MentionInput.test.ts index 01a991e2..7e38e860 100644 --- a/src/renderer/components/composer/MentionInput.test.ts +++ b/src/renderer/components/composer/MentionInput.test.ts @@ -21,4 +21,19 @@ describe("buildMentionResults", () => { it("does not show Browser until the composer allows it", () => { expect(buildMentionResults(fileResults, "browser", false)).toEqual(fileResults); }); + + it("shows Computer Use when enabled and the query matches", () => { + expect(buildMentionResults(fileResults, "computer", false, true)).toEqual([ + { type: "computer_use", path: "computer", name: "Computer Use" }, + ...fileResults, + ]); + }); + + it("shows Browser before Computer Use for an empty @ mention", () => { + expect(buildMentionResults(fileResults, "", true, true)).toEqual([ + { type: "browser", path: "browser", name: "Browser" }, + { type: "computer_use", path: "computer", name: "Computer Use" }, + ...fileResults, + ]); + }); }); diff --git a/src/renderer/components/composer/MentionInput.tsx b/src/renderer/components/composer/MentionInput.tsx index 04c989b5..88417906 100644 --- a/src/renderer/components/composer/MentionInput.tsx +++ b/src/renderer/components/composer/MentionInput.tsx @@ -9,16 +9,24 @@ import { useDebouncedFileSearch } from "./useDebouncedFileSearch"; import { serializeToSegments, flattenSegments } from "./serializeMentions"; const BROWSER_MENTION_ENTRY: MentionEntry = { type: "browser", path: "browser", name: "Browser" }; +const COMPUTER_USE_MENTION_ENTRY: MentionEntry = { + type: "computer_use", + path: "computer", + name: "Computer Use", +}; export function buildMentionResults( fileResults: FileEntry[], query: string, showBrowserMention: boolean, + showComputerUseMention = false, ): MentionEntry[] { const q = query.trim().toLowerCase(); const browserResults = showBrowserMention && "browser".startsWith(q) ? 
[BROWSER_MENTION_ENTRY] : []; - return [...browserResults, ...fileResults]; + const computerUseResults = + showComputerUseMention && "computer use".startsWith(q) ? [COMPUTER_USE_MENTION_ENTRY] : []; + return [...browserResults, ...computerUseResults, ...fileResults]; } export interface MentionInputHandle { @@ -249,6 +257,8 @@ export const MentionInput = forwardRef< onPasteImage?: (file: File) => void; showBrowserMention?: boolean; onBrowserMentionSelect?: () => void; + showComputerUseMention?: boolean; + onComputerUseMentionSelect?: () => void; onSlashCommandChange?: (query: string | null) => void; /** * Trigger words to promote into chips as the user types/pastes (e.g. the @@ -277,6 +287,8 @@ export const MentionInput = forwardRef< onPasteImage, showBrowserMention, onBrowserMentionSelect, + showComputerUseMention, + onComputerUseMentionSelect, onSlashCommandChange, onInterceptKey, triggerWords, @@ -298,11 +310,12 @@ export const MentionInput = forwardRef< fileResults, mention?.query ?? 
"", showBrowserMention === true, + showComputerUseMention === true, ); useEffect(() => { setActiveIndex(0); - }, [mention?.query, fileResults, showBrowserMention]); + }, [mention?.query, fileResults, showBrowserMention, showComputerUseMention]); function insertPlainText(text: string) { const editor = editorRef.current; @@ -564,6 +577,18 @@ export const MentionInput = forwardRef< return; } + if (entry.type === "computer_use") { + const sel = window.getSelection(); + if (!sel) return; + sel.removeAllRanges(); + sel.addRange(range); + range.deleteContents(); + setMention(null); + onComputerUseMentionSelect?.(); + notifyTextChange(); + return; + } + const mentionData: FileMentionData = { path: entry.path, name: entry.name, diff --git a/src/renderer/components/composer/MentionPopover.tsx b/src/renderer/components/composer/MentionPopover.tsx index c83c93d3..9e3ce3d0 100644 --- a/src/renderer/components/composer/MentionPopover.tsx +++ b/src/renderer/components/composer/MentionPopover.tsx @@ -1,6 +1,6 @@ import { useEffect, useRef } from "react"; import { createPortal } from "react-dom"; -import { Globe } from "lucide-react"; +import { Globe, Monitor } from "lucide-react"; import type { FileEntry } from "@/shared/contracts"; import { getEntryIconUrl } from "@/renderer/components/common/fileIcons"; @@ -10,7 +10,13 @@ export type BrowserMentionEntry = { name: "Browser"; }; -export type MentionEntry = FileEntry | BrowserMentionEntry; +export type ComputerUseMentionEntry = { + type: "computer_use"; + path: "computer"; + name: "Computer Use"; +}; + +export type MentionEntry = FileEntry | BrowserMentionEntry | ComputerUseMentionEntry; function getParentDir(path: string): string { const lastSlash = path.lastIndexOf("/"); @@ -62,6 +68,7 @@ export function MentionPopover(props: { const dir = getParentDir(entry.path); const isActive = index === activeIndex; const isBrowser = entry.type === "browser"; + const isComputerUse = entry.type === "computer_use"; return (
{isBrowser ? (