From 5578f22c96ee36c8072ea29f05263950c0506193 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sun, 5 Apr 2026 13:30:06 +0000 Subject: [PATCH] feat(voice): pluggable voice backend with Gemini Live & Qwen Realtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a strategy-based voice backend architecture that allows switching between ElevenLabs ConvAI, Gemini Live API, and Qwen Realtime via the VOICE_BACKEND environment variable. New backends: - Gemini 2.5 Live (gemini-live): WebSocket + AudioWorklet audio pipeline, full function calling support for messageCodingAgent/processPermissionRequest - Qwen Realtime (qwen-realtime): DashScope API via Hub WebSocket proxy, voice conversation support (function calling not yet supported by model) Architecture: - VoiceBackendSession dynamically selects backend via GET /voice/backend - React.lazy() code splitting — alternative backends not bundled when unused - Hub routes: GET /voice/backend, POST /voice/gemini-token, POST /voice/qwen-token - Hub WebSocket proxy at /api/voice/qwen-ws for Qwen (browser can't set Auth header) - Inline Blob URL AudioWorklet for Vite compatibility - Auto mic mute during model speech to prevent barge-in from ambient noise - Tool-call-optimized system prompt (Chinese, no greeting turn) - PWA skipWaiting + clientsClaim for immediate deployment activation Switch via environment: VOICE_BACKEND=gemini-live GEMINI_API_KEY=xxx VOICE_BACKEND=qwen-realtime DASHSCOPE_API_KEY=xxx VOICE_BACKEND=elevenlabs ELEVENLABS_API_KEY=xxx (default, unchanged) --- hub/src/web/routes/voice.test.ts | 148 +++++++ hub/src/web/routes/voice.ts | 52 ++- hub/src/web/server.ts | 108 ++++- shared/src/voice.ts | 99 ++++- web/src/api/client.ts | 29 ++ web/src/api/voice.ts | 65 +++ web/src/components/SessionChat.tsx | 6 +- web/src/realtime/GeminiLiveVoiceSession.tsx | 369 +++++++++++++++++ web/src/realtime/QwenVoiceSession.tsx | 376 ++++++++++++++++++ web/src/realtime/VoiceBackendSession.tsx | 54 +++ web/src/realtime/gemini/audioPlayer.ts | 75 ++++ web/src/realtime/gemini/audioRecorder.ts | 132 ++++++ .../realtime/gemini/pcm-recorder.worklet.ts | 35 ++ web/src/realtime/gemini/pcmUtils.test.ts | 60 +++ web/src/realtime/gemini/pcmUtils.ts | 39 ++ web/src/realtime/gemini/toolAdapter.test.ts | 28 ++ web/src/realtime/gemini/toolAdapter.ts | 70 ++++ web/src/realtime/index.ts | 5 +- web/src/realtime/realtimeClientTools.ts | 6 +- web/src/sw.ts | 4 + web/tsconfig.json | 3 +- 21 files changed, 1744 insertions(+), 19 deletions(-) create mode 100644 hub/src/web/routes/voice.test.ts create mode 100644 web/src/realtime/GeminiLiveVoiceSession.tsx create mode 100644 web/src/realtime/QwenVoiceSession.tsx create mode 100644 web/src/realtime/VoiceBackendSession.tsx create mode 100644 web/src/realtime/gemini/audioPlayer.ts create mode 100644 web/src/realtime/gemini/audioRecorder.ts create mode 100644 web/src/realtime/gemini/pcm-recorder.worklet.ts create mode 100644 web/src/realtime/gemini/pcmUtils.test.ts create mode 100644 web/src/realtime/gemini/pcmUtils.ts create mode 100644 web/src/realtime/gemini/toolAdapter.test.ts create mode 100644 web/src/realtime/gemini/toolAdapter.ts diff --git a/hub/src/web/routes/voice.test.ts b/hub/src/web/routes/voice.test.ts new file mode 100644 index 000000000..da989374c --- /dev/null +++ b/hub/src/web/routes/voice.test.ts @@ -0,0 +1,148 @@ +import { describe, test, expect, beforeEach, afterEach } from 'bun:test' +import { Hono } from 'hono' +import type { WebAppEnv } from '../middleware/auth' +import { createVoiceRoutes } from './voice' + +function createApp() { + const app = new Hono() + app.route('/api', createVoiceRoutes()) + return app +} + +describe('GET /api/voice/backend', () => { + const originalEnv = process.env.VOICE_BACKEND + + afterEach(() => { + if (originalEnv === undefined) { + delete process.env.VOICE_BACKEND + } else { + process.env.VOICE_BACKEND = originalEnv + } + }) + + test('returns elevenlabs by default', async () => { + delete process.env.VOICE_BACKEND + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) + + test('returns gemini-live when configured', async () => { + process.env.VOICE_BACKEND = 'gemini-live' + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('gemini-live') + }) + + test('returns qwen-realtime when configured', async () => { + process.env.VOICE_BACKEND = 'qwen-realtime' + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('qwen-realtime') + }) + + test('falls back to elevenlabs for unknown values', async () => { + process.env.VOICE_BACKEND = 'unknown-backend' + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) +}) + +describe('POST /api/voice/gemini-token', () => { + const origGemini = process.env.GEMINI_API_KEY + const origGoogle = process.env.GOOGLE_API_KEY + + afterEach(() => { + if (origGemini === undefined) delete process.env.GEMINI_API_KEY + else process.env.GEMINI_API_KEY = origGemini + if (origGoogle === undefined) delete process.env.GOOGLE_API_KEY + else process.env.GOOGLE_API_KEY = origGoogle + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.GEMINI_API_KEY + delete process.env.GOOGLE_API_KEY + const app = createApp() + const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns GEMINI_API_KEY when set', async () => { + process.env.GEMINI_API_KEY = 'test-gemini-key' + delete process.env.GOOGLE_API_KEY + const app = createApp() + const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-gemini-key') + }) + + test('falls back to GOOGLE_API_KEY', async () => { + delete process.env.GEMINI_API_KEY + process.env.GOOGLE_API_KEY = 'test-google-key' + const app = createApp() + const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-google-key') + }) +}) + +describe('POST /api/voice/qwen-token', () => { + const origDash = process.env.DASHSCOPE_API_KEY + const origQwen = process.env.QWEN_API_KEY + + afterEach(() => { + if (origDash === undefined) delete process.env.DASHSCOPE_API_KEY + else process.env.DASHSCOPE_API_KEY = origDash + if (origQwen === undefined) delete process.env.QWEN_API_KEY + else process.env.QWEN_API_KEY = origQwen + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.DASHSCOPE_API_KEY + delete process.env.QWEN_API_KEY + const app = createApp() + const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns DASHSCOPE_API_KEY when set', async () => { + process.env.DASHSCOPE_API_KEY = 'test-dash-key' + delete process.env.QWEN_API_KEY + const app = createApp() + const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-dash-key') + }) + + test('falls back to QWEN_API_KEY', async () => { + delete process.env.DASHSCOPE_API_KEY + process.env.QWEN_API_KEY = 'test-qwen-key' + const app = createApp() + const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-qwen-key') + }) +}) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index 1a55f8363..f71b65211 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -4,8 +4,10 @@ import type { WebAppEnv } from '../middleware/auth' import { ELEVENLABS_API_BASE, VOICE_AGENT_NAME, - buildVoiceAgentConfig + buildVoiceAgentConfig, + DEFAULT_VOICE_BACKEND } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' const tokenRequestSchema = z.object({ customAgentId: z.string().optional(), @@ -116,6 +118,54 @@ async function getOrCreateAgentId(apiKey: string): Promise { export function createVoiceRoutes(): Hono { const app = new Hono() + // Return the configured voice backend type + app.get('/voice/backend', (c) => { + const raw = process.env.VOICE_BACKEND + const backend: VoiceBackendType = + raw === 'gemini-live' ? 'gemini-live' + : raw === 'qwen-realtime' ? 'qwen-realtime' + : DEFAULT_VOICE_BACKEND + return c.json({ backend }) + }) + + // Get Gemini API key for Gemini Live voice sessions + // Gemini Live API does not support ephemeral tokens, so we proxy the key. + // The key is short-lived in the browser session and never persisted client-side. + app.post('/voice/gemini-token', async (c) => { + const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'Gemini API key not configured (set GEMINI_API_KEY or GOOGLE_API_KEY)' + }, 400) + } + + return c.json({ + allowed: true, + apiKey, + // Optional overrides for proxy/relay setups + wsUrl: process.env.GEMINI_LIVE_WS_URL || undefined, + baseUrl: process.env.GEMINI_API_BASE || undefined + }) + }) + + // Get Qwen (DashScope) API key for Qwen Realtime voice sessions + app.post('/voice/qwen-token', async (c) => { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'DashScope API key not configured (set DASHSCOPE_API_KEY or QWEN_API_KEY)' + }, 400) + } + + return c.json({ + allowed: true, + apiKey, + wsUrl: process.env.QWEN_REALTIME_WS_URL || undefined + }) + }) + // Get ElevenLabs ConvAI conversation token app.post('/voice/token', async (c) => { const json = await c.req.json().catch(() => null) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index b4dbf4eb5..b23672912 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -21,9 +21,61 @@ import { createPushRoutes } from './routes/push' import { createVoiceRoutes } from './routes/voice' import type { SSEManager } from '../sse/sseManager' import type { VisibilityTracker } from '../visibility/visibilityTracker' -import type { Server as BunServer } from 'bun' +import type { Server as BunServer, ServerWebSocket } from 'bun' import type { Server as SocketEngine } from '@socket.io/bun-engine' import type { WebSocketData } from '@socket.io/bun-engine' + +// Qwen Realtime WebSocket proxy — bridges browser (no custom headers) to DashScope (requires Authorization header) +function createQwenProxyWebSocketHandler() { + const QWEN_WS_BASE = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime' + // Map browser WS → upstream WS + const upstreamMap = new WeakMap, WebSocket>() + + return { + open(clientWs: ServerWebSocket) { + const data = clientWs.data as { apiKey: string; model: string } + const upstreamUrl = `${process.env.QWEN_REALTIME_WS_URL || QWEN_WS_BASE}?model=${encodeURIComponent(data.model)}` + + const upstream = new WebSocket(upstreamUrl, { + headers: { 'Authorization': `Bearer ${data.apiKey}` } + } as unknown as string[]) + + upstreamMap.set(clientWs, upstream) + + upstream.onopen = () => { + // Connection ready — upstream will send session.created + } + upstream.onmessage = (event) => { + // Forward upstream → client + try { + if (clientWs.readyState === 1) { + clientWs.send(typeof event.data === 'string' ? event.data : new Uint8Array(event.data as ArrayBuffer)) + } + } catch { /* client gone */ } + } + upstream.onerror = () => { + try { clientWs.close(1011, 'Upstream error') } catch { /* */ } + } + upstream.onclose = (event) => { + try { clientWs.close(event.code, event.reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + }, + message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + const upstream = upstreamMap.get(clientWs) + if (upstream?.readyState === WebSocket.OPEN) { + upstream.send(typeof message === 'string' ? message : message) + } + }, + close(clientWs: ServerWebSocket, code: number, reason: string) { + const upstream = upstreamMap.get(clientWs) + if (upstream) { + try { upstream.close(code, reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + } + } +} import { loadEmbeddedAssetMap, type EmbeddedWebAsset } from './embeddedAssets' import { isBunCompiled } from '../utils/bunCompiled' import type { Store } from '../store' @@ -230,16 +282,62 @@ export async function startWebServer(options: { const socketHandler = options.socketEngine.handler() - const server = Bun.serve({ + // Wrap socket.io websocket handler to also support Qwen Realtime proxy + const originalWsHandler = socketHandler.websocket + const qwenProxyHandler = createQwenProxyWebSocketHandler() + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const server = (Bun.serve as any)({ hostname: configuration.listenHost, port: configuration.listenPort, idleTimeout: Math.max(30, socketHandler.idleTimeout), maxRequestBodySize: Math.max(socketHandler.maxRequestBodySize, 68 * 1024 * 1024), - websocket: socketHandler.websocket, - fetch: (req, server) => { + websocket: { + ...originalWsHandler, + open(ws: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> + if (wsAny.data?._qwenProxy) { + qwenProxyHandler.open(wsAny) + } else { + originalWsHandler.open?.(ws as never) + } + }, + message(ws: unknown, message: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> + if (wsAny.data?._qwenProxy) { + qwenProxyHandler.message(wsAny, message as string) + } else { + originalWsHandler.message?.(ws as never, message as never) + } + }, + close(ws: unknown, code: number, reason: string) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> + if (wsAny.data?._qwenProxy) { + qwenProxyHandler.close(wsAny, code, reason) + } else { + originalWsHandler.close?.(ws as never, code as never, reason as never) + } + } + }, + fetch: (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => { const url = new URL(req.url) if (url.pathname.startsWith('/socket.io/')) { - return socketHandler.fetch(req, server) + return socketHandler.fetch(req, server as never) + } + // Qwen Realtime WebSocket proxy + if (url.pathname === '/api/voice/qwen-ws') { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + const model = url.searchParams.get('model') || 'qwen3.5-omni-plus-realtime' + if (!apiKey) { + return new Response('DashScope API key not configured', { status: 400 }) + } + const upgraded = server.upgrade(req, { + data: { _qwenProxy: true, apiKey, model } + }) + if (!upgraded) { + return new Response('WebSocket upgrade failed', { status: 500 }) + } + return undefined as unknown as Response } return app.fetch(req) } diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 6751f0eba..b2b6f74e7 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -8,7 +8,11 @@ export const ELEVENLABS_API_BASE = 'https://api.elevenlabs.io/v1' export const VOICE_AGENT_NAME = 'Hapi Voice Assistant' -export const VOICE_SYSTEM_PROMPT = `# Identity +export const VOICE_SYSTEM_PROMPT = `# CRITICAL RULE - Tool Usage + +You MUST call the messageCodingAgent tool for ANY request related to coding, files, development, debugging, or tasks for the agent. Do NOT respond verbally to these requests — call the tool FIRST, then briefly confirm. This is your most important behavior. + +# Identity You are Hapi Voice Assistant. You bridge voice communication between users and their AI coding agents in the Hapi ecosystem. @@ -136,9 +140,21 @@ For builds, tests, or large file operations: - Treat garbled input as phonetic hints and ask for clarification - Correct yourself immediately if you realize you made an error - Keep conversations forward-moving with fresh insights -- Assume a technical software developer audience` +- Assume a technical software developer audience + +# Language + +IMPORTANT: Always respond in Chinese (Mandarin). Use natural spoken Chinese. +- Greet users in Chinese +- Summarize technical content in Chinese +- Use English only for proper nouns, tool names, and code identifiers +- Keep the same warm, concise conversational style in Chinese + +# First Interaction -export const VOICE_FIRST_MESSAGE = "Hey! Hapi here." +When the user speaks to you for the first time, begin your response with a brief greeting (e.g. "你好!") before addressing their request. If their first message is a coding request, greet briefly AND call the tool — do both.` + +export const VOICE_FIRST_MESSAGE = "嗨!我是 Hapi 语音助手,有什么可以帮你的?" export const VOICE_TOOLS = [ { @@ -223,7 +239,7 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { conversation_config: { agent: { first_message: VOICE_FIRST_MESSAGE, - language: 'en', + language: 'zh', prompt: { prompt: VOICE_SYSTEM_PROMPT, llm: 'gemini-2.5-flash', @@ -255,3 +271,78 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { } } } + +export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' + +export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime' +export const QWEN_REALTIME_VOICE = 'Mia' + +export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'gemini-live' + +export const GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest' + +export interface VoiceToolDefinition { + name: string + description: string + parameters: { + type: 'object' + required: string[] + properties: Record + } +} + +type VoiceToolSource = Pick<(typeof VOICE_TOOLS)[number], 'name' | 'description' | 'parameters'> + +function cloneVoiceToolDefinition(tool: VoiceToolSource): VoiceToolDefinition { + const properties: VoiceToolDefinition['parameters']['properties'] = {} + + for (const [key, value] of Object.entries(tool.parameters.properties)) { + properties[key] = { + type: value.type, + description: value.description + } + } + + return { + name: tool.name, + description: tool.description, + parameters: { + type: 'object', + required: [...tool.parameters.required], + properties + } + } +} + +export const VOICE_TOOL_DEFINITIONS: VoiceToolDefinition[] = VOICE_TOOLS.map(cloneVoiceToolDefinition) + +export type GeminiLiveFunctionDeclaration = VoiceToolDefinition + +export interface GeminiLiveConfig { + model: string + systemInstruction: string + tools: Array<{ + functionDeclarations: GeminiLiveFunctionDeclaration[] + }> + responseModalities: ['AUDIO'] +} + +export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclaration[] { + return VOICE_TOOLS.map(cloneVoiceToolDefinition) +} + +export function buildGeminiLiveConfig(): GeminiLiveConfig { + return { + model: GEMINI_LIVE_MODEL, + systemInstruction: VOICE_SYSTEM_PROMPT, + tools: [ + { + functionDeclarations: buildGeminiLiveFunctionDeclarations() + } + ], + responseModalities: ['AUDIO'] + } +} diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 163eb206d..97e019093 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -436,4 +436,33 @@ export class ApiClient { body: JSON.stringify(options || {}) }) } + + async fetchVoiceBackend(): Promise<{ backend: string }> { + return await this.request('/api/voice/backend') + } + + async fetchQwenToken(): Promise<{ + allowed: boolean + apiKey?: string + wsUrl?: string + error?: string + }> { + return await this.request('/api/voice/qwen-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } + + async fetchGeminiToken(): Promise<{ + allowed: boolean + apiKey?: string + wsUrl?: string + baseUrl?: string + error?: string + }> { + return await this.request('/api/voice/gemini-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } } diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index 66cee443f..5e532eec3 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -15,6 +15,7 @@ import { VOICE_AGENT_NAME, buildVoiceAgentConfig } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' export interface VoiceTokenResponse { allowed: boolean @@ -160,3 +161,67 @@ export async function createOrUpdateHapiAgent(apiKey: string): Promise { + try { + return await api.fetchQwenToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? error.message : 'Network error' + } + } +} + +export interface VoiceBackendResponse { + backend: VoiceBackendType +} + +export interface GeminiTokenResponse { + allowed: boolean + apiKey?: string + wsUrl?: string + baseUrl?: string + error?: string +} + +/** + * Discover which voice backend the hub is configured to use. + */ +export async function fetchVoiceBackend(api: ApiClient): Promise { + try { + const result = await api.fetchVoiceBackend() + const backend = result.backend === 'gemini-live' ? 'gemini-live' + : result.backend === 'qwen-realtime' ? 'qwen-realtime' + : 'elevenlabs' + return { backend } as VoiceBackendResponse + } catch { + return { backend: 'elevenlabs' } + } +} + +/** + * Fetch a Gemini API key from the hub for Gemini Live voice sessions. + */ +export async function fetchGeminiToken(api: ApiClient): Promise { + try { + return await api.fetchGeminiToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? error.message : 'Network error' + } + } +} diff --git a/web/src/components/SessionChat.tsx b/web/src/components/SessionChat.tsx index 841286245..ccf96b65b 100644 --- a/web/src/components/SessionChat.tsx +++ b/web/src/components/SessionChat.tsx @@ -27,7 +27,7 @@ import { TeamPanel } from '@/components/TeamPanel' import { usePlatform } from '@/hooks/usePlatform' import { useSessionActions } from '@/hooks/mutations/useSessionActions' import { useVoiceOptional } from '@/lib/voice-context' -import { RealtimeVoiceSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' +import { VoiceBackendSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' import { isRemoteTerminalSupported } from '@/utils/terminalSupport' export function SessionChat(props: { @@ -401,9 +401,9 @@ export function SessionChat(props: { - {/* Voice session component - renders nothing but initializes ElevenLabs */} + {/* Voice session component - renders nothing but initializes voice backend */} {voice && ( - { + cleanup() + state.statusCallback?.('connecting') + + // Get API key from hub + const tokenResp = await fetchGeminiToken(this.api) + if (!tokenResp.allowed || !tokenResp.apiKey) { + const msg = tokenResp.error ?? 'Gemini API key not available' + state.statusCallback?.('error', msg) + throw new Error(msg) + } + state.apiKey = tokenResp.apiKey + state.wsBaseUrl = tokenResp.wsUrl || null + + // Request microphone + let permissionStream: MediaStream | null = null + try { + permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + } catch (error) { + state.statusCallback?.('error', 'Microphone permission denied') + throw error + } finally { + permissionStream?.getTracks().forEach((t) => t.stop()) + } + + // Connect WebSocket + const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE + const wsUrl = `${wsBase}?key=${encodeURIComponent(state.apiKey)}` + const ws = new WebSocket(wsUrl) + state.ws = ws + + return new Promise((resolve, reject) => { + let setupDone = false + + ws.onopen = () => { + if (DEBUG) console.log('[GeminiLive] WebSocket connected, sending setup') + + const liveConfig = buildGeminiLiveConfig() + const setupMessage = { + setup: { + model: `models/${liveConfig.model}`, + generationConfig: { + responseModalities: ['AUDIO'], + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { voiceName: 'Aoede' } + } + } + }, + systemInstruction: { + parts: [{ text: liveConfig.systemInstruction }] + }, + tools: liveConfig.tools.map((t) => ({ + functionDeclarations: t.functionDeclarations.map((fd) => ({ + name: fd.name, + description: fd.description, + parameters: fd.parameters + })) + })) + } + } + + ws.send(JSON.stringify(setupMessage)) + } + + ws.onmessage = async (event) => { + let data: Record + try { + if (event.data instanceof Blob) { + const text = await event.data.text() + data = JSON.parse(text) as Record + } else { + data = JSON.parse(event.data as string) as Record + } + } catch { + if (DEBUG) console.warn('[GeminiLive] Failed to parse message') + return + } + + // Log all message types for debugging + const msgKeys = Object.keys(data).filter(k => k !== 'serverContent' || !('modelTurn' in (data.serverContent as Record || {}))) + if (!data.serverContent) { + console.log('[GeminiLive] Message:', msgKeys.join(', '), JSON.stringify(data).slice(0, 200)) + } + + // Setup complete + if (data.setupComplete && !setupDone) { + setupDone = true + if (DEBUG) console.log('[GeminiLive] Setup complete') + state.statusCallback?.('connected') + + // Start audio capture + startAudioCapture() + + // Send initial context if available (no clientContent greeting — it breaks tool calls) + if (config.initialContext) { + sendClientContent(`[Context] ${config.initialContext}`) + } + + resolve() + return + } + + // Server content (audio / text / turn complete) + const serverContent = data.serverContent as { + modelTurn?: { parts?: Array<{ inlineData?: { data: string; mimeType: string }; text?: string }> } + turnComplete?: boolean + } | undefined + + if (serverContent) { + if (serverContent.modelTurn?.parts) { + // Model is generating — mute mic to prevent barge-in from noise + if (!state.modelSpeaking) { + state.modelSpeaking = true + state.recorder?.setMuted(true) + } + for (const part of serverContent.modelTurn.parts) { + if (part.inlineData?.data) { + state.player?.enqueue(part.inlineData.data) + } + if (part.text) { + console.log('[GeminiLive] Text:', part.text) + } + } + } + if (serverContent.turnComplete) { + console.log('[GeminiLive] Turn complete') + // Model done — unmute mic for next user turn + state.modelSpeaking = false + state.recorder?.setMuted(false) + } + } + + // Tool calls + const toolCall = data.toolCall as { + functionCalls?: Array<{ name: string; args: Record; id: string }> + } | undefined + + if (toolCall?.functionCalls && toolCall.functionCalls.length > 0) { + console.log('[GeminiLive] Tool calls:', toolCall.functionCalls.map((c) => c.name)) + + const responses = await handleGeminiFunctionCalls( + toolCall.functionCalls as GeminiFunctionCall[] + ) + + // Send tool responses back + if (state.ws?.readyState === WebSocket.OPEN) { + state.ws.send(JSON.stringify({ + toolResponse: { + functionResponses: responses.map((r) => ({ + id: r.id, + name: r.name, + response: r.response + })) + } + })) + } + } + } + + ws.onerror = (event) => { + console.error('[GeminiLive] WebSocket error:', event) + if (!setupDone) { + state.statusCallback?.('error', 'WebSocket connection failed') + reject(new Error('WebSocket connection failed')) + } + } + + ws.onclose = (event) => { + if (DEBUG) console.log('[GeminiLive] WebSocket closed:', event.code, event.reason) + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + }) + } + + async endSession(): Promise { + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + + sendTextMessage(message: string): void { + sendClientContent(message) + } + + sendContextualUpdate(update: string): void { + // Send as a system-like context message + sendClientContent(`[System Context Update] ${update}`) + } +} + +function sendClientContent(text: string): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + state.ws.send(JSON.stringify({ + clientContent: { + turns: [{ role: 'user', parts: [{ text }] }], + turnComplete: true + } + })) +} + +function sendAudioChunk(base64Pcm: string): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + // Don't send audio while model is speaking + if (state.modelSpeaking) return + state.ws.send(JSON.stringify({ + realtimeInput: { + mediaChunks: [{ + mimeType: 'audio/pcm;rate=16000', + data: base64Pcm + }] + } + })) +} + +function startAudioCapture(): void { + state.player = new GeminiAudioPlayer() + state.recorder = new GeminiAudioRecorder() + + state.recorder.start( + (pcm16Chunk) => sendAudioChunk(pcm16Chunk), + (error) => { + console.error('[GeminiLive] Audio capture error:', error) + state.statusCallback?.('error', 'Microphone error') + } + ) +} + +// --- React component --- + +export interface GeminiLiveVoiceSessionProps { + api: ApiClient + micMuted?: boolean + onStatusChange?: StatusCallback + getSession?: (sessionId: string) => Session | null + sendMessage?: (sessionId: string, message: string) => void + approvePermission?: (sessionId: string, requestId: string) => Promise + denyPermission?: (sessionId: string, requestId: string) => Promise +} + +export function GeminiLiveVoiceSession({ + api, + micMuted = false, + onStatusChange, + getSession, + sendMessage, + approvePermission, + denyPermission +}: GeminiLiveVoiceSessionProps) { + const hasRegistered = useRef(false) + + // Store status callback + useEffect(() => { + state.statusCallback = onStatusChange || null + return () => { state.statusCallback = null } + }, [onStatusChange]) + + // Register session store for client tools + useEffect(() => { + if (getSession && sendMessage && approvePermission && denyPermission) { + registerSessionStore({ + getSession: (sessionId: string) => + getSession(sessionId) as { agentState?: { requests?: Record } } | null, + sendMessage, + approvePermission, + denyPermission + }) + } + }, [getSession, sendMessage, approvePermission, denyPermission]) + + // Register voice session once + useEffect(() => { + if (!hasRegistered.current) { + try { + registerVoiceSession(new GeminiLiveVoiceSessionImpl(api)) + hasRegistered.current = true + } catch (error) { + console.error('[GeminiLive] Failed to register voice session:', error) + } + } + }, [api]) + + // Sync mic mute state + useEffect(() => { + if (state.recorder) { + state.recorder.setMuted(micMuted) + } + }, [micMuted]) + + // Handle barge-in: clear audio queue when user starts speaking + const handleBargeIn = useCallback(() => { + if (state.player?.isPlaying()) { + state.player.clearQueue() + } + }, []) + + // Cleanup on unmount + useEffect(() => { + return () => { + cleanup() + } + }, []) + + return null +} diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx new file mode 100644 index 000000000..3c9808c6f --- /dev/null +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -0,0 +1,376 @@ +import { useEffect, useRef, useCallback } from 'react' +import { registerVoiceSession, resetRealtimeSessionState } from './RealtimeSession' +import { registerSessionStore } from './realtimeClientTools' +import { fetchQwenToken } from '@/api/voice' +import { GeminiAudioRecorder } from './gemini/audioRecorder' +import { GeminiAudioPlayer } from './gemini/audioPlayer' +import { realtimeClientTools } from './realtimeClientTools' +import { + QWEN_REALTIME_MODEL, + QWEN_REALTIME_VOICE, + VOICE_SYSTEM_PROMPT, + VOICE_TOOL_DEFINITIONS +} from '@hapi/protocol/voice' +import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' +import type { ApiClient } from '@/api/client' +import type { Session } from '@/types/api' + +const DEBUG = import.meta.env.DEV + +// Qwen WebSocket connects via Hub proxy (browser can't set Authorization header) + +interface QwenState { + ws: WebSocket | null + recorder: GeminiAudioRecorder | null + player: GeminiAudioPlayer | null + statusCallback: StatusCallback | null + apiKey: string | null + wsBaseUrl: string | null +} + +const state: QwenState = { + ws: null, + recorder: null, + player: null, + statusCallback: null, + apiKey: null, + wsBaseUrl: null +} + +let eventCounter = 0 +function nextEventId(): string { + return `evt_${++eventCounter}` +} + +function cleanup() { + if (state.recorder) { + state.recorder.dispose() + state.recorder = null + } + if (state.player) { + state.player.dispose() + state.player = null + } + if (state.ws) { + if (state.ws.readyState === WebSocket.OPEN || state.ws.readyState === WebSocket.CONNECTING) { + state.ws.close() + } + state.ws = null + } +} + +function sendEvent(type: string, payload?: Record): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + state.ws.send(JSON.stringify({ + event_id: nextEventId(), + type, + ...payload + })) +} + +class QwenVoiceSessionImpl implements VoiceSession { + private api: ApiClient + + constructor(api: ApiClient) { + this.api = api + } + + async startSession(config: VoiceSessionConfig): Promise { + cleanup() + state.statusCallback?.('connecting') + + // Get API key from hub + const tokenResp = await fetchQwenToken(this.api) + if (!tokenResp.allowed || !tokenResp.apiKey) { + const msg = tokenResp.error ?? 'DashScope API key not available' + state.statusCallback?.('error', msg) + throw new Error(msg) + } + state.apiKey = tokenResp.apiKey + state.wsBaseUrl = tokenResp.wsUrl || null + + // Request microphone + let permissionStream: MediaStream | null = null + try { + permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + } catch (error) { + state.statusCallback?.('error', 'Microphone permission denied') + throw error + } finally { + permissionStream?.getTracks().forEach((t) => t.stop()) + } + + // Connect via Hub WebSocket proxy (DashScope requires Authorization header, + // which browser WebSocket API doesn't support) + const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' + const proxyBase = state.wsBaseUrl || `${protocol}//${window.location.host}` + const model = QWEN_REALTIME_MODEL + const wsUrl = `${proxyBase}/api/voice/qwen-ws?model=${encodeURIComponent(model)}` + const ws = new WebSocket(wsUrl) + state.ws = ws + + return new Promise((resolve, reject) => { + let sessionCreated = false + + ws.onopen = () => { + if (DEBUG) console.log('[Qwen] WebSocket connected') + } + + ws.onmessage = async (event) => { + let data: Record + try { + data = JSON.parse(event.data as string) as Record + } catch { + if (DEBUG) console.warn('[Qwen] Failed to parse message') + return + } + + const eventType = data.type as string + + // Session created - send configuration + if (eventType === 'session.created' && !sessionCreated) { + sessionCreated = true + if (DEBUG) console.log('[Qwen] Session created') + + // Build tools config + const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({ + type: 'function' as const, + name: td.name, + description: td.description, + parameters: td.parameters + })) + + // Send session.update with full configuration + const instructions = config.initialContext + ? `${VOICE_SYSTEM_PROMPT}\n\n[Current Context]\n${config.initialContext}` + : VOICE_SYSTEM_PROMPT + + sendEvent('session.update', { + session: { + modalities: ['text', 'audio'], + voice: QWEN_REALTIME_VOICE, + input_audio_format: 'pcm', + output_audio_format: 'pcm', + instructions, + temperature: 0.7, + turn_detection: { + type: 'server_vad', + threshold: 0.5, + silence_duration_ms: 800, + prefix_padding_ms: 300 + }, + tools, + tool_choice: 'auto' + } + }) + return + } + + // Session updated - ready to go + if (eventType === 'session.updated') { + if (DEBUG) console.log('[Qwen] Session configured') + state.statusCallback?.('connected') + startAudioCapture() + resolve() + return + } + + // Audio output streaming + if (eventType === 'response.audio.delta') { + const delta = data.delta as string + if (delta) { + state.player?.enqueue(delta) + } + return + } + + // Text transcript (for debug) + if (eventType === 'response.audio_transcript.delta' && DEBUG) { + console.log('[Qwen] Transcript:', data.delta) + return + } + + // Function call complete + if (eventType === 'response.function_call_arguments.done') { + const callId = data.call_id as string + const fnName = data.name as string + const argsStr = data.arguments as string + + if (DEBUG) console.log('[Qwen] Tool call:', fnName, argsStr) + + let args: Record = {} + try { args = JSON.parse(argsStr) } catch { /* empty */ } + + // Execute the tool + const handler = fnName === 'messageCodingAgent' + ? realtimeClientTools.messageCodingAgent + : fnName === 'processPermissionRequest' + ? realtimeClientTools.processPermissionRequest + : null + + const result = handler + ? await handler(args) + : `error (unknown tool: ${fnName})` + + // Send function result back + sendEvent('conversation.item.create', { + item: { + type: 'function_call_output', + call_id: callId, + output: typeof result === 'string' ? result : JSON.stringify(result) + } + }) + // Trigger model to continue + sendEvent('response.create') + return + } + + // VAD: user started speaking - barge-in + if (eventType === 'input_audio_buffer.speech_started') { + if (state.player?.isPlaying()) { + state.player.clearQueue() + } + return + } + + // Response done + if (eventType === 'response.done' && DEBUG) { + const resp = data.response as Record | undefined + const usage = resp?.usage as Record | undefined + if (usage) console.log('[Qwen] Usage:', usage) + return + } + + // Error + if (eventType === 'error') { + const err = data.error as { message?: string } | undefined + console.error('[Qwen] Server error:', err?.message || data) + return + } + } + + ws.onerror = (event) => { + console.error('[Qwen] WebSocket error:', event) + if (!sessionCreated) { + state.statusCallback?.('error', 'WebSocket connection failed') + reject(new Error('WebSocket connection failed')) + } + } + + ws.onclose = (event) => { + if (DEBUG) console.log('[Qwen] WebSocket closed:', event.code, event.reason) + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + }) + } + + async endSession(): Promise { + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + + sendTextMessage(message: string): void { + // Send text as a user message via conversation.item.create + sendEvent('conversation.item.create', { + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: message }] + } + }) + sendEvent('response.create') + } + + sendContextualUpdate(update: string): void { + // Send context as a system-like user message + sendEvent('conversation.item.create', { + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: `[System Context Update] ${update}` }] + } + }) + } +} + +function startAudioCapture(): void { + state.player = new GeminiAudioPlayer() + state.recorder = new GeminiAudioRecorder() + + state.recorder.start( + (base64Pcm) => { + sendEvent('input_audio_buffer.append', { audio: base64Pcm }) + }, + (error) => { + console.error('[Qwen] Audio capture error:', error) + state.statusCallback?.('error', 'Microphone error') + } + ) +} + +// --- React component --- + +export interface QwenVoiceSessionProps { + api: ApiClient + micMuted?: boolean + onStatusChange?: StatusCallback + getSession?: (sessionId: string) => Session | null + sendMessage?: (sessionId: string, message: string) => void + approvePermission?: (sessionId: string, requestId: string) => Promise + denyPermission?: (sessionId: string, requestId: string) => Promise +} + +export function QwenVoiceSession({ + api, + micMuted = false, + onStatusChange, + getSession, + sendMessage, + approvePermission, + denyPermission +}: QwenVoiceSessionProps) { + const hasRegistered = useRef(false) + + useEffect(() => { + state.statusCallback = onStatusChange || null + return () => { state.statusCallback = null } + }, [onStatusChange]) + + useEffect(() => { + if (getSession && sendMessage && approvePermission && denyPermission) { + registerSessionStore({ + getSession: (sessionId: string) => + getSession(sessionId) as { agentState?: { requests?: Record } } | null, + sendMessage, + approvePermission, + denyPermission + }) + } + }, [getSession, sendMessage, approvePermission, denyPermission]) + + useEffect(() => { + if (!hasRegistered.current) { + try { + registerVoiceSession(new QwenVoiceSessionImpl(api)) + hasRegistered.current = true + } catch (error) { + console.error('[Qwen] Failed to register voice session:', error) + } + } + }, [api]) + + useEffect(() => { + if (state.recorder) { + state.recorder.setMuted(micMuted) + } + }, [micMuted]) + + useEffect(() => { + return () => { cleanup() } + }, []) + + return null +} diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx new file mode 100644 index 000000000..c23dfa150 --- /dev/null +++ b/web/src/realtime/VoiceBackendSession.tsx @@ -0,0 +1,54 @@ +import { lazy, Suspense, useEffect, useState } from 'react' +import { RealtimeVoiceSession } from './RealtimeVoiceSession' +import type { RealtimeVoiceSessionProps } from './RealtimeVoiceSession' +import { fetchVoiceBackend } from '@/api/voice' +import type { ApiClient } from '@/api/client' +import type { VoiceBackendType } from '@hapi/protocol/voice' + +// Lazy-load alternative backends to avoid bundling when using ElevenLabs +const GeminiLiveVoiceSession = lazy(() => + import('./GeminiLiveVoiceSession').then((m) => ({ default: m.GeminiLiveVoiceSession })) +) +const QwenVoiceSession = lazy(() => + import('./QwenVoiceSession').then((m) => ({ default: m.QwenVoiceSession })) +) + +export type VoiceBackendSessionProps = RealtimeVoiceSessionProps & { + api: ApiClient +} + +/** + * Dynamically selects the voice session component based on the hub's configured backend. + * Queries GET /voice/backend once on mount and renders the appropriate component. + */ +export function VoiceBackendSession(props: VoiceBackendSessionProps) { + const [backend, setBackend] = useState(null) + + useEffect(() => { + let cancelled = false + fetchVoiceBackend(props.api).then((resp) => { + if (!cancelled) setBackend(resp.backend) + }) + return () => { cancelled = true } + }, [props.api]) + + if (!backend) return null + + if (backend === 'gemini-live') { + return ( + + + + ) + } + + if (backend === 'qwen-realtime') { + return ( + + + + ) + } + + return +} diff --git a/web/src/realtime/gemini/audioPlayer.ts b/web/src/realtime/gemini/audioPlayer.ts new file mode 100644 index 000000000..23d1d341e --- /dev/null +++ b/web/src/realtime/gemini/audioPlayer.ts @@ -0,0 +1,75 @@ +import { base64ToArrayBuffer, pcm16ToFloat32 } from './pcmUtils'; + +export class GeminiAudioPlayer { + private audioContext: AudioContext; + private ownsContext: boolean; + private lastEndTime: number = 0; + private activeSources: AudioBufferSourceNode[] = []; + + constructor(audioContext?: AudioContext) { + if (audioContext) { + this.audioContext = audioContext; + this.ownsContext = false; + } else { + this.audioContext = new AudioContext({ sampleRate: 24000 }); + this.ownsContext = true; + } + this.lastEndTime = this.audioContext.currentTime; + } + + enqueue(base64Pcm: string): void { + if (this.audioContext.state === 'suspended') { + this.audioContext.resume(); + } + + const arrayBuffer = base64ToArrayBuffer(base64Pcm); + const float32Data = pcm16ToFloat32(arrayBuffer); + + if (float32Data.length === 0) return; + + const audioBuffer = this.audioContext.createBuffer(1, float32Data.length, 24000); + audioBuffer.copyToChannel(new Float32Array(float32Data), 0); + + const source = this.audioContext.createBufferSource(); + source.buffer = audioBuffer; + source.connect(this.audioContext.destination); + + const startTime = Math.max(this.audioContext.currentTime, this.lastEndTime); + + source.onended = () => { + const index = this.activeSources.indexOf(source); + if (index > -1) { + this.activeSources.splice(index, 1); + } + }; + + source.start(startTime); + this.activeSources.push(source); + + this.lastEndTime = startTime + audioBuffer.duration; + } + + clearQueue(): void { + this.activeSources.forEach(source => { + try { + source.stop(); + } catch (e) { + // Ignore if already stopped + } + source.disconnect(); + }); + this.activeSources = []; + this.lastEndTime = this.audioContext.currentTime; + } + + isPlaying(): boolean { + return this.lastEndTime > this.audioContext.currentTime; + } + + dispose(): void { + this.clearQueue(); + if (this.ownsContext && this.audioContext.state !== 'closed') { + this.audioContext.close(); + } + } +} diff --git a/web/src/realtime/gemini/audioRecorder.ts b/web/src/realtime/gemini/audioRecorder.ts new file mode 100644 index 000000000..b1c01a7c4 --- /dev/null +++ b/web/src/realtime/gemini/audioRecorder.ts @@ -0,0 +1,132 @@ +import { float32ToPcm16, arrayBufferToBase64 } from './pcmUtils'; + +// Inline worklet source to avoid Vite bundling issues with ?url imports. +// AudioWorklet.addModule() requires a URL to valid JS, so we create a Blob URL. +const WORKLET_SOURCE = ` +class PcmRecorderProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.buffer = new Float32Array(4096); + this.idx = 0; + } + process(inputs) { + const input = inputs[0]; + if (input && input.length > 0) { + const channel = input[0]; + for (let i = 0; i < channel.length; i++) { + this.buffer[this.idx++] = channel[i]; + if (this.idx >= 4096) { + this.port.postMessage({ samples: this.buffer.slice() }); + this.idx = 0; + } + } + } + return true; + } +} +registerProcessor('pcm-recorder-processor', PcmRecorderProcessor); +`; + +function createWorkletUrl(): string { + const blob = new Blob([WORKLET_SOURCE], { type: 'application/javascript' }); + return URL.createObjectURL(blob); +} + +export class GeminiAudioRecorder { + private audioContext: AudioContext | null = null; + private mediaStream: MediaStream | null = null; + private sourceNode: MediaStreamAudioSourceNode | null = null; + private workletNode: AudioWorkletNode | null = null; + private scriptNode: ScriptProcessorNode | null = null; + + async start(onChunk: (base64Pcm: string) => void, onError?: (error: Error) => void): Promise { + try { + this.mediaStream = await navigator.mediaDevices.getUserMedia({ + audio: { sampleRate: 16000, channelCount: 1 } + }); + + this.mediaStream.getTracks().forEach((track) => { + track.onended = () => { + if (onError) onError(new Error('Microphone disconnected')); + }; + }); + + this.audioContext = new AudioContext({ sampleRate: 16000 }); + if (this.audioContext.state === 'suspended') { + await this.audioContext.resume(); + } + + this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream); + + try { + const workletUrl = createWorkletUrl(); + await this.audioContext.audioWorklet.addModule(workletUrl); + URL.revokeObjectURL(workletUrl); + + this.workletNode = new AudioWorkletNode(this.audioContext, 'pcm-recorder-processor'); + this.workletNode.port.onmessage = (event) => { + const pcm16 = float32ToPcm16(event.data.samples); + const base64 = arrayBufferToBase64(pcm16); + onChunk(base64); + }; + this.sourceNode.connect(this.workletNode); + } catch (e) { + console.warn('[GeminiLive] AudioWorklet failed, falling back to ScriptProcessorNode', e); + this.scriptNode = this.audioContext.createScriptProcessor(4096, 1, 1); + this.scriptNode.onaudioprocess = (event) => { + const inputData = event.inputBuffer.getChannelData(0); + const pcm16 = float32ToPcm16(new Float32Array(inputData)); + const base64 = arrayBufferToBase64(pcm16); + onChunk(base64); + }; + this.sourceNode.connect(this.scriptNode); + this.scriptNode.connect(this.audioContext.destination); + } + } catch (e) { + if (onError) onError(e instanceof Error ? e : new Error(String(e))); + throw e; + } + } + + stop(): void { + if (this.mediaStream) { + this.mediaStream.getTracks().forEach(track => { + track.onended = null; + track.stop(); + }); + this.mediaStream = null; + } + + if (this.scriptNode) { + this.scriptNode.disconnect(); + this.scriptNode = null; + } + + if (this.workletNode) { + this.workletNode.disconnect(); + this.workletNode = null; + } + + if (this.sourceNode) { + this.sourceNode.disconnect(); + this.sourceNode = null; + } + + if (this.audioContext) { + this.audioContext.close(); + this.audioContext = null; + } + } + + setMuted(muted: boolean): void { + if (this.mediaStream) { + this.mediaStream.getAudioTracks().forEach(track => { + track.enabled = !muted; + }); + } + } + + dispose(): void { + this.stop(); + } +} diff --git a/web/src/realtime/gemini/pcm-recorder.worklet.ts b/web/src/realtime/gemini/pcm-recorder.worklet.ts new file mode 100644 index 000000000..404f65445 --- /dev/null +++ b/web/src/realtime/gemini/pcm-recorder.worklet.ts @@ -0,0 +1,35 @@ +// AudioWorklet processor runs in a separate scope with its own globals. +// These declarations satisfy TypeScript without pulling in DOM lib types. +declare class AudioWorkletProcessor { + readonly port: MessagePort + constructor() +} +declare function registerProcessor(name: string, ctor: new () => AudioWorkletProcessor): void + +class PcmRecorderProcessor extends AudioWorkletProcessor { + private buffer: Float32Array; + private bufferSize = 4096; + private bufferIndex = 0; + + constructor() { + super(); + this.buffer = new Float32Array(this.bufferSize); + } + + process(inputs: Float32Array[][]): boolean { + const input = inputs[0]; + if (input && input.length > 0) { + const channel = input[0]; + for (let i = 0; i < channel.length; i++) { + this.buffer[this.bufferIndex++] = channel[i]; + if (this.bufferIndex >= this.bufferSize) { + this.port.postMessage({ samples: this.buffer.slice() }); + this.bufferIndex = 0; + } + } + } + return true; + } +} + +registerProcessor('pcm-recorder-processor', PcmRecorderProcessor); diff --git a/web/src/realtime/gemini/pcmUtils.test.ts b/web/src/realtime/gemini/pcmUtils.test.ts new file mode 100644 index 000000000..2e0be05c3 --- /dev/null +++ b/web/src/realtime/gemini/pcmUtils.test.ts @@ -0,0 +1,60 @@ +import { describe, test, expect } from 'bun:test' +import { + float32ToPcm16, + pcm16ToFloat32, + arrayBufferToBase64, + base64ToArrayBuffer +} from './pcmUtils' + +describe('pcmUtils', () => { + describe('float32ToPcm16 / pcm16ToFloat32 round-trip', () => { + test('preserves signal within quantization error', () => { + const input = new Float32Array([0, 0.5, -0.5, 1.0, -1.0]) + const pcm16 = float32ToPcm16(input) + const output = pcm16ToFloat32(pcm16) + + expect(output.length).toBe(input.length) + for (let i = 0; i < input.length; i++) { + expect(Math.abs(output[i] - input[i])).toBeLessThan(0.001) + } + }) + + test('clamps values outside [-1, 1]', () => { + const input = new Float32Array([2.0, -2.0]) + const pcm16 = float32ToPcm16(input) + const output = pcm16ToFloat32(pcm16) + + expect(Math.abs(output[0] - 1.0)).toBeLessThan(0.001) + expect(Math.abs(output[1] - (-1.0))).toBeLessThan(0.001) + }) + + test('handles empty input', () => { + const input = new Float32Array(0) + const pcm16 = float32ToPcm16(input) + expect(pcm16.byteLength).toBe(0) + const output = pcm16ToFloat32(pcm16) + expect(output.length).toBe(0) + }) + }) + + describe('arrayBufferToBase64 / base64ToArrayBuffer round-trip', () => { + test('preserves binary data', () => { + const original = new Uint8Array([0, 1, 127, 128, 255]) + const base64 = arrayBufferToBase64(original.buffer) + const restored = new Uint8Array(base64ToArrayBuffer(base64)) + + expect(restored.length).toBe(original.length) + for (let i = 0; i < original.length; i++) { + expect(restored[i]).toBe(original[i]) + } + }) + + test('handles empty buffer', () => { + const empty = new ArrayBuffer(0) + const base64 = arrayBufferToBase64(empty) + expect(base64).toBe('') + const restored = base64ToArrayBuffer(base64) + expect(restored.byteLength).toBe(0) + }) + }) +}) diff --git a/web/src/realtime/gemini/pcmUtils.ts b/web/src/realtime/gemini/pcmUtils.ts new file mode 100644 index 000000000..67e2928fc --- /dev/null +++ b/web/src/realtime/gemini/pcmUtils.ts @@ -0,0 +1,39 @@ +export function float32ToPcm16(samples: Float32Array): ArrayBuffer { + const buffer = new ArrayBuffer(samples.length * 2); + const view = new DataView(buffer); + for (let i = 0; i < samples.length; i++) { + let s = Math.max(-1, Math.min(1, samples[i])); + s = s < 0 ? s * 0x8000 : s * 0x7FFF; + view.setInt16(i * 2, s, true); + } + return buffer; +} + +export function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array { + const int16Array = new Int16Array(buffer); + const float32Array = new Float32Array(int16Array.length); + for (let i = 0; i < int16Array.length; i++) { + const s = int16Array[i]; + float32Array[i] = s < 0 ? s / 0x8000 : s / 0x7FFF; + } + return float32Array; +} + +export function arrayBufferToBase64(buffer: ArrayBuffer): string { + let binary = ''; + const bytes = new Uint8Array(buffer); + const len = bytes.byteLength; + for (let i = 0; i < len; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); +} + +export function base64ToArrayBuffer(base64: string): ArrayBuffer { + const binary = atob(base64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) { + bytes[i] = binary.charCodeAt(i); + } + return bytes.buffer; +} diff --git a/web/src/realtime/gemini/toolAdapter.test.ts b/web/src/realtime/gemini/toolAdapter.test.ts new file mode 100644 index 000000000..5d98d6d4d --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.test.ts @@ -0,0 +1,28 @@ +import { describe, test, expect } from 'bun:test' +import { handleGeminiFunctionCall, handleGeminiFunctionCalls } from './toolAdapter' +import type { GeminiFunctionCall } from './toolAdapter' + +describe('toolAdapter', () => { + test('returns error for unknown tool', async () => { + const call: GeminiFunctionCall = { + name: 'unknownTool', + args: {}, + id: 'call-1' + } + const resp = await handleGeminiFunctionCall(call) + expect(resp.name).toBe('unknownTool') + expect(resp.id).toBe('call-1') + expect(resp.response.result).toContain('unknown tool') + }) + + test('handles multiple calls in parallel', async () => { + const calls: GeminiFunctionCall[] = [ + { name: 'unknownA', args: {}, id: 'a' }, + { name: 'unknownB', args: {}, id: 'b' } + ] + const responses = await handleGeminiFunctionCalls(calls) + expect(responses.length).toBe(2) + expect(responses[0].id).toBe('a') + expect(responses[1].id).toBe('b') + }) +}) diff --git a/web/src/realtime/gemini/toolAdapter.ts b/web/src/realtime/gemini/toolAdapter.ts new file mode 100644 index 000000000..dd44e4fb1 --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.ts @@ -0,0 +1,70 @@ +import { realtimeClientTools } from '../realtimeClientTools' + +/** + * Gemini Live API function call from server. + * Matches the `toolCall` shape in a BidiGenerateContent serverMessage. + */ +export interface GeminiFunctionCall { + name: string + args: Record + id: string +} + +/** + * Response sent back to Gemini Live via `toolResponse`. + */ +export interface GeminiFunctionResponse { + name: string + id: string + response: { result: string } +} + +type ClientToolHandler = (parameters: unknown) => Promise + +const toolHandlers: Record = { + messageCodingAgent: realtimeClientTools.messageCodingAgent, + processPermissionRequest: realtimeClientTools.processPermissionRequest +} + +/** + * Execute a Gemini Live function call using the existing client tool handlers. + * Returns a GeminiFunctionResponse ready to send back over the WebSocket. + */ +export async function handleGeminiFunctionCall( + call: GeminiFunctionCall +): Promise { + const handler = toolHandlers[call.name] + + if (!handler) { + return { + name: call.name, + id: call.id, + response: { result: `error (unknown tool: ${call.name})` } + } + } + + try { + const result = await handler(call.args) + return { + name: call.name, + id: call.id, + response: { result } + } + } catch (error) { + const message = error instanceof Error ? error.message : 'unknown error' + return { + name: call.name, + id: call.id, + response: { result: `error (${message})` } + } + } +} + +/** + * Process multiple function calls in parallel and return all responses. + */ +export async function handleGeminiFunctionCalls( + calls: GeminiFunctionCall[] +): Promise { + return Promise.all(calls.map(handleGeminiFunctionCall)) +} diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts index a7fa2fbe9..1e080123b 100644 --- a/web/src/realtime/index.ts +++ b/web/src/realtime/index.ts @@ -15,8 +15,11 @@ export { // Client tools export { realtimeClientTools, registerSessionStore } from './realtimeClientTools' -// Voice session component +// Voice session components export { RealtimeVoiceSession, type RealtimeVoiceSessionProps } from './RealtimeVoiceSession' +export { GeminiLiveVoiceSession, type GeminiLiveVoiceSessionProps } from './GeminiLiveVoiceSession' +export { QwenVoiceSession, type QwenVoiceSessionProps } from './QwenVoiceSession' +export { VoiceBackendSession, type VoiceBackendSessionProps } from './VoiceBackendSession' // Voice hooks export { voiceHooks, registerVoiceHooksStore } from './hooks/voiceHooks' diff --git a/web/src/realtime/realtimeClientTools.ts b/web/src/realtime/realtimeClientTools.ts index a2490ac81..962898c56 100644 --- a/web/src/realtime/realtimeClientTools.ts +++ b/web/src/realtime/realtimeClientTools.ts @@ -45,10 +45,8 @@ export const realtimeClientTools = { return 'error (session store not available)' } - if (VOICE_CONFIG.ENABLE_DEBUG_LOGGING) { - console.log('[Voice] messageCodingAgent called with:', message) - console.log('[Voice] Sending message to session:', sessionId) - } + console.log('[Voice] messageCodingAgent called with:', message) + console.log('[Voice] Sending message to session:', sessionId) sessionStore.sendMessage(sessionId, message) return "sent [DO NOT say anything else, simply say 'sent']" diff --git a/web/src/sw.ts b/web/src/sw.ts index ebe55dc0a..732ebef29 100644 --- a/web/src/sw.ts +++ b/web/src/sw.ts @@ -21,6 +21,10 @@ type PushPayload = { } } +// Activate new SW immediately without waiting for all tabs to close +self.addEventListener('install', () => { self.skipWaiting() }) +self.addEventListener('activate', (event) => { event.waitUntil(self.clients.claim()) }) + precacheAndRoute(self.__WB_MANIFEST) registerRoute( diff --git a/web/tsconfig.json b/web/tsconfig.json index 8b0682a4b..de7bcdca5 100644 --- a/web/tsconfig.json +++ b/web/tsconfig.json @@ -11,5 +11,6 @@ "@/*": ["./src/*"] } }, - "include": ["src"] + "include": ["src"], + "exclude": ["src/**/*.test.ts", "src/**/*.test.tsx", "src/**/*.spec.ts", "src/**/*.spec.tsx"] }