From 430ba7e6f53035c54963cc2c371ecfacd83a9264 Mon Sep 17 00:00:00 2001 From: Omar Al-Jadda Date: Sun, 21 Jun 2026 15:50:21 -0700 Subject: [PATCH] feat(tts): add Microsoft Edge TTS provider Adds a fourth TTS provider that uses Microsoft Edge's online Read Aloud service via WebSocket. No API key, account, or TTS_API_URL required; only outbound HTTPS/WebSocket access to speech.platform.bing.com. Implementation ports the protocol from the Python edge-tts reference (https://github.com/rany2/edge-tts): - Sec-MS-GEC DRM token: SHA256 of Windows file-time ticks (rounded to 5 min) + trusted client token, with single 403 retry that corrects clock skew from the server Date header - SSML framing with prosody rate/volume/pitch and 4096-byte UTF-8-safe chunking that never splits multi-byte chars or XML entities - WebSocket message handling: binary frames use a 2-byte big-endian header-length prefix (length includes the trailing CRLF), audio starts at offset 2 + headerLength; text frames signal turn.end per chunk Adds ws as a runtime dependency (Node 20+ target; global WebSocket is only stable from Node 22+). 
- New: src/app/services/edge-tts.ts + tests - Wire 'edge' provider into config.ts and tts-service.ts - isTtsConfigured() returns true for edge (no credentials needed) - Document provider in .env.example and PRODUCT.md --- .env.example | 9 +- PRODUCT.md | 2 +- package-lock.json | 35 ++- package.json | 4 +- src/app/services/edge-tts.ts | 395 +++++++++++++++++++++++++ src/app/services/tts-service.ts | 26 ++ src/config.ts | 8 +- tests/app/services/edge-tts.test.ts | 222 ++++++++++++++ tests/app/services/tts-service.test.ts | 51 ++++ 9 files changed, 745 insertions(+), 7 deletions(-) create mode 100644 src/app/services/edge-tts.ts create mode 100644 tests/app/services/edge-tts.test.ts diff --git a/.env.example b/.env.example index 052766b0..75de9d86 100644 --- a/.env.example +++ b/.env.example @@ -154,7 +154,7 @@ OPENCODE_MODEL_ID=big-pickle # Text-to-Speech credentials (optional) # TTS reply behavior is controlled globally with /tts and persisted in settings.json. -# Provider: "openai" (default), "elevenlabs", or "google". +# Provider: "openai" (default), "elevenlabs", "google", or "edge". # # --- OpenAI-compatible (default) --- # Set TTS_API_URL and TTS_API_KEY to any OpenAI-compatible TTS endpoint. @@ -180,3 +180,10 @@ OPENCODE_MODEL_ID=big-pickle # TTS_PROVIDER=google # TTS_VOICE=en-US-Studio-O # GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account-key.json +# +# --- Microsoft Edge TTS --- +# Uses Microsoft Edge's online Read Aloud service. No API key or account +# required; only an outbound HTTPS/WebSocket connection to +# speech.platform.bing.com. Voice list: https://learn.microsoft.com/azure/ai-services/speech-service/language-support +# TTS_PROVIDER=edge +# TTS_VOICE=en-US-EmmaMultilingualNeural diff --git a/PRODUCT.md b/PRODUCT.md index 652285dd..5cf84941 100644 --- a/PRODUCT.md +++ b/PRODUCT.md @@ -90,7 +90,7 @@ No public inbound ports are required for normal usage. 
- Configurable opt-in display of full thinking/reasoning content - Configurable max code file size in KB (default: 100) - Optional STT settings for voice transcription (`STT_API_URL`, `STT_API_KEY`, `STT_MODEL`, `STT_LANGUAGE`) -- Optional TTS settings for global audio replies (`TTS_PROVIDER`, `TTS_API_URL`, `TTS_API_KEY`, `TTS_MODEL`, `TTS_VOICE`) +- Optional TTS settings for global audio replies (`TTS_PROVIDER`, `TTS_API_URL`, `TTS_API_KEY`, `TTS_MODEL`, `TTS_VOICE`); supported providers: OpenAI-compatible, ElevenLabs, Google Cloud TTS, and Microsoft Edge TTS (no API key required) - Optional IPv4-only mode for Telegram connectivity (`TELEGRAM_FORCE_IPV4`) ## Current Product Scope diff --git a/package-lock.json b/package-lock.json index 55026217..d38b8de5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -21,7 +21,8 @@ "remark-gfm": "^4.0.1", "remark-parse": "^11.0.0", "socks-proxy-agent": "^8.0.5", - "unified": "^11.0.5" + "unified": "^11.0.5", + "ws": "^8.21.0" }, "bin": { "opencode-telegram": "dist/cli.js" @@ -29,6 +30,7 @@ "devDependencies": { "@types/better-sqlite3": "^7.6.13", "@types/node": "^25.0.8", + "@types/ws": "^8.18.1", "@typescript-eslint/eslint-plugin": "^8.53.0", "@typescript-eslint/parser": "^8.53.0", "@vitest/coverage-v8": "^3.2.4", @@ -1359,6 +1361,16 @@ "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", "license": "MIT" }, + "node_modules/@types/ws": { + "version": "8.18.1", + "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz", + "integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@typescript-eslint/eslint-plugin": { "version": "8.53.0", "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.53.0.tgz", @@ -6145,6 +6157,27 @@ "integrity": 
"sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", "license": "ISC" }, + "node_modules/ws": { + "version": "8.21.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.21.0.tgz", + "integrity": "sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==", + "license": "MIT", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index 8fe79fe3..e6b5ae03 100644 --- a/package.json +++ b/package.json @@ -63,11 +63,13 @@ "remark-gfm": "^4.0.1", "remark-parse": "^11.0.0", "socks-proxy-agent": "^8.0.5", - "unified": "^11.0.5" + "unified": "^11.0.5", + "ws": "^8.21.0" }, "devDependencies": { "@types/better-sqlite3": "^7.6.13", "@types/node": "^25.0.8", + "@types/ws": "^8.18.1", "@typescript-eslint/eslint-plugin": "^8.53.0", "@typescript-eslint/parser": "^8.53.0", "@vitest/coverage-v8": "^3.2.4", diff --git a/src/app/services/edge-tts.ts b/src/app/services/edge-tts.ts new file mode 100644 index 00000000..efc42e88 --- /dev/null +++ b/src/app/services/edge-tts.ts @@ -0,0 +1,395 @@ +import { createHash, randomBytes, randomUUID } from "crypto"; +import { WebSocket } from "ws"; +import { logger } from "../../utils/logger.js"; + +/** + * Microsoft Edge online text-to-speech client. + * + * Speaks the same WebSocket protocol used by Microsoft Edge's Read Aloud + * feature (wss://speech.platform.bing.com/.../readaloud/edge/v1). No API key + * is required; access is authenticated through a SHA256 "Sec-MS-GEC" token + * derived from the current time. + * + * Ported from the Python reference implementation at + * https://github.com/rany2/edge-tts. 
+ */
+
+// Service endpoint. TRUSTED_CLIENT_TOKEN is sent as a query parameter and is
+// also mixed into the Sec-MS-GEC hash computed by generateSecMsGec().
+const BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud";
+const TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
+const WSS_URL = `wss://${BASE_URL}/edge/v1?TrustedClientToken=${TRUSTED_CLIENT_TOKEN}`;
+
+// Browser version advertised in the User-Agent header and in Sec-MS-GEC-Version.
+const CHROMIUM_FULL_VERSION = "143.0.3650.75";
+const CHROMIUM_MAJOR_VERSION = CHROMIUM_FULL_VERSION.split(".")[0];
+export const SEC_MS_GEC_VERSION = `1-${CHROMIUM_FULL_VERSION}`;
+
+/** Voice used when the caller does not supply one. */
+export const EDGE_DEFAULT_VOICE = "en-US-EmmaMultilingualNeural";
+
+// Seconds between the Windows file-time epoch (1601-01-01) and the Unix epoch.
+const WIN_EPOCH_SECONDS = 11644473600;
+// Windows file time counts 100-nanosecond ticks.
+const TICKS_PER_SECOND = 10_000_000;
+// Token granularity: timestamps are rounded down to 5-minute windows.
+const ROUND_SECONDS = 300;
+
+// Headers sent with the WebSocket upgrade request.
+const WSS_HEADERS: Record<string, string> = {
+  Pragma: "no-cache",
+  "Cache-Control": "no-cache",
+  Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
+  "User-Agent":
+    `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` +
+    `(KHTML, like Gecko) Chrome/${CHROMIUM_MAJOR_VERSION}.0.0.0 Safari/537.36 ` +
+    `Edg/${CHROMIUM_MAJOR_VERSION}.0.0.0`,
+  "Accept-Encoding": "gzip, deflate, br, zstd",
+  "Accept-Language": "en-US,en;q=0.9",
+};
+
+// Upper bound for one complete synthesis attempt (connect + all chunks).
+const SYNTHESIS_TIMEOUT_MS = 60_000;
+// Maximum UTF-8 size of the text placed in a single SSML request.
+const MAX_CHUNK_BYTES = 4096;
+
+// Correction (in seconds) added to the local clock when deriving the token.
+// Updated after an HTTP 403 from the server's Date header.
+let clockSkewSeconds = 0;
+
+/**
+ * Generates the Sec-MS-GEC DRM token Microsoft requires on every request.
+ *
+ * The token is the SHA256 (uppercased hex) of `<ticks><TRUSTED_CLIENT_TOKEN>`
+ * where `ticks` is the current time as Windows file time (100-ns intervals
+ * since 1601-01-01) rounded down to the nearest 5 minutes. Rounded to limit
+ * token churn; the server accepts any token valid within the current
+ * 5-minute window.
+ *
+ * @param now - Time to derive the token from; defaults to the current time.
+ *   The module-level clock-skew correction is always added on top.
+ * @returns 64-character uppercase hexadecimal SHA256 digest.
+ */
+export function generateSecMsGec(now: Date = new Date()): string {
+  // Work in whole seconds so the window arithmetic below is exact integer
+  // math instead of relying on rounding a floating-point remainder.
+  const fileTimeSeconds =
+    Math.floor(now.getTime() / 1000 + clockSkewSeconds) + WIN_EPOCH_SECONDS;
+  const windowStart = fileTimeSeconds - (fileTimeSeconds % ROUND_SECONDS);
+  // BigInt keeps the tick count exact; it exceeds Number.MAX_SAFE_INTEGER.
+  const ticks = BigInt(windowStart) * BigInt(TICKS_PER_SECOND);
+  const strToHash = `${ticks}${TRUSTED_CLIENT_TOKEN}`;
+  return createHash("sha256").update(strToHash, "ascii").digest("hex").toUpperCase();
+}
+
+/** @internal Reset clock skew (for tests only).
*/
+export function _resetClockSkew(): void {
+  clockSkewSeconds = 0;
+}
+
+/** Returns 16 random bytes as uppercase hex, used as the `muid` cookie value. */
+function generateMuid(): string {
+  return randomBytes(16).toString("hex").toUpperCase();
+}
+
+/** Returns a random UUID with the dashes removed (connection / request id). */
+function connectId(): string {
+  return randomUUID().replace(/-/g, "");
+}
+
+/**
+ * Formats a date, in UTC, as
+ * `Thu Jan 01 2024 00:00:00 GMT+0000 (Coordinated Universal Time)`.
+ * Used for the X-Timestamp header of outgoing messages.
+ */
+function jsDateString(date: Date = new Date()): string {
+  const days = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"];
+  const months = [
+    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+  ];
+  const pad = (n: number): string => n.toString().padStart(2, "0");
+  const datePart =
+    `${days[date.getUTCDay()]} ${months[date.getUTCMonth()]} ` +
+    `${pad(date.getUTCDate())} ${date.getUTCFullYear()}`;
+  const timePart =
+    `${pad(date.getUTCHours())}:${pad(date.getUTCMinutes())}:${pad(date.getUTCSeconds())}`;
+  return `${datePart} ${timePart} GMT+0000 (Coordinated Universal Time)`;
+}
+
+/**
+ * Escapes the characters that are significant in XML text content so that
+ * arbitrary input can be embedded in the SSML body.
+ *
+ * "&" must be replaced first; otherwise the ampersands introduced by the
+ * other two replacements would be escaped a second time.
+ */
+function escapeXml(text: string): string {
+  return text
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;");
+}
+
+/**
+ * Replaces control characters the service rejects (0x00-0x08, 0x0B-0x0C,
+ * 0x0E-0x1F) with spaces. Common in OCR'd text; without this the service
+ * returns an error. Tab (0x09), LF (0x0A) and CR (0x0D) are left untouched.
+ */
+function removeIncompatibleCharacters(text: string): string {
+  const isRejected = (code: number): boolean =>
+    (code >= 0 && code <= 8) ||
+    (code >= 11 && code <= 12) ||
+    (code >= 14 && code <= 31);
+
+  let result = "";
+  // Iterate by code point so surrogate pairs are copied through intact.
+  for (const char of text) {
+    result += isRejected(char.codePointAt(0)!) ? " " : char;
+  }
+  return result;
+}
+
+/**
+ * Reports whether the first `length` bytes of `buf` form complete UTF-8.
+ * A prefix that ends inside a multi-byte sequence decodes to U+FFFD and
+ * therefore does not re-encode to the same bytes.
+ */
+function isValidUtf8Prefix(buf: Buffer, length: number): boolean {
+  const prefix = buf.subarray(0, length);
+  return Buffer.from(prefix.toString("utf-8"), "utf-8").equals(prefix);
+}
+
+/** Moves a split point back so it does not land inside an XML entity (such as &amp;).
*/
+function adjustForXmlEntity(buf: Buffer, splitAt: number): number {
+  let result = splitAt;
+  while (result > 0) {
+    const ampersandIndex = buf.subarray(0, result).lastIndexOf("&");
+    if (ampersandIndex < 0) break;
+    // A ";" between the "&" and the split point means that entity is complete.
+    if (buf.subarray(ampersandIndex, result).includes(";")) break;
+    // Otherwise the split would cut the entity: move it in front of the "&".
+    result = ampersandIndex;
+  }
+  return result;
+}
+
+/**
+ * Splits text into chunks no larger than `byteLength` UTF-8 bytes, preferring
+ * to break at newlines or spaces and never inside a multi-byte character or
+ * XML entity. Mirrors edge-tts's split_text_by_byte_length.
+ *
+ * Each chunk is trimmed and empty chunks are dropped, so whitespace at the
+ * split points is not preserved.
+ *
+ * @param text - Text to split (already XML-escaped when used for SSML).
+ * @param byteLength - Maximum chunk size in bytes; must be positive.
+ * @returns The non-empty chunks in order.
+ * @throws Error if `byteLength` is not greater than 0.
+ */
+export function splitTextByByteLength(text: string, byteLength: number): string[] {
+  if (byteLength <= 0) {
+    throw new Error("byteLength must be greater than 0");
+  }
+  let rest = Buffer.from(text, "utf-8");
+  const chunks: string[] = [];
+  while (rest.length > byteLength) {
+    // Prefer the last newline inside the window, then the last space.
+    let splitAt = rest.lastIndexOf(0x0a, byteLength - 1);
+    if (splitAt < 0) splitAt = rest.lastIndexOf(0x20, byteLength - 1);
+    if (splitAt < 0) {
+      // No whitespace: cut at the limit, backing off to a character boundary.
+      splitAt = byteLength;
+      while (splitAt > 0 && !isValidUtf8Prefix(rest, splitAt)) {
+        splitAt--;
+      }
+    }
+    splitAt = adjustForXmlEntity(rest, splitAt);
+    // Always consume at least one byte so the loop makes progress.
+    if (splitAt <= 0) splitAt = 1;
+    const chunk = rest.subarray(0, splitAt).toString("utf-8").trim();
+    if (chunk) chunks.push(chunk);
+    rest = rest.subarray(splitAt);
+  }
+  const remaining = rest.toString("utf-8").trim();
+  if (remaining) chunks.push(remaining);
+  return chunks;
+}
+
+/**
+ * Wraps already-escaped text in the SSML envelope expected by the service:
+ * speak > voice > prosody. `text` is inserted verbatim, so it must already be
+ * XML-escaped; `voice`, `rate`, `volume` and `pitch` are placed in
+ * single-quoted attributes and are not escaped here.
+ */
+function buildSsml(
+  voice: string,
+  rate: string,
+  volume: string,
+  pitch: string,
+  text: string,
+): string {
+  return (
+    "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>" +
+    `<voice name='${voice}'>` +
+    `<prosody pitch='${pitch}' rate='${rate}' volume='${volume}'>` +
+    text +
+    "</prosody></voice></speak>"
+  );
+}
+
+/** Parses an HTTP Date header into Unix seconds, or null if it is unparseable. */
+function parseRfc2616Date(date: string): number | null {
+  const parsed = Date.parse(date);
+  return Number.isNaN(parsed) ? null : parsed / 1000;
+}
+
+/** Raised when the server answers the WebSocket upgrade with a plain HTTP response. */
+class EdgeHttpUpgradeError extends Error {
+  /** HTTP status code of the response (0 if the response carried none). */
+  readonly statusCode: number;
+  /** Raw value of the response's Date header, if present. */
+  readonly serverDate: string | null;
+  constructor(statusCode: number, serverDate: string | null) {
+    super(`Edge TTS WebSocket upgrade failed: HTTP ${statusCode}`);
+    this.name = "EdgeHttpUpgradeError";
+    this.statusCode = statusCode;
+    this.serverDate = serverDate;
+  }
+}
+
+/** Fully resolved voice and prosody settings for one synthesis request. */
+interface SynthesisParams {
+  voice: string;
+  rate: string;
+  volume: string;
+  pitch: string;
+}
+
+/**
+ * Opens one WebSocket, streams SSML chunks sequentially, and resolves with the
+ * concatenated MP3 audio bytes. Retries once on HTTP 403 (clock skew) by
+ * re-deriving the token against the server's reported time.
+ *
+ * @throws Error if `chunks` is empty, or whatever the final attempt rejects with.
+ */
+async function streamSynthesis(chunks: string[], params: SynthesisParams): Promise<Buffer> {
+  // With nothing to send the service never emits turn.end, so the attempt
+  // would only end when the synthesis timeout fires. Fail immediately instead.
+  if (chunks.length === 0) {
+    throw new Error("Edge TTS: no text to synthesize");
+  }
+  for (let attempt = 0; attempt < 2; attempt++) {
+    try {
+      return await attemptSynthesis(chunks, params);
+    } catch (err) {
+      if (
+        err instanceof EdgeHttpUpgradeError &&
+        err.statusCode === 403 &&
+        attempt === 0 &&
+        err.serverDate
+      ) {
+        const serverTime = parseRfc2616Date(err.serverDate);
+        if (serverTime !== null) {
+          // Adjust the stored skew so that local time + skew equals server time.
+          const clientTime = Date.now() / 1000 + clockSkewSeconds;
+          clockSkewSeconds += serverTime - clientTime;
+          logger.warn(
+            `[EdgeTTS] HTTP 403: adjusted clock skew by ${serverTime - clientTime}s, retrying`,
+          );
+          continue;
+        }
+      }
+      throw err;
+    }
+  }
+  throw new Error("Edge TTS synthesis failed after retry");
+}
+
+/**
+ * Performs a single synthesis attempt over a fresh WebSocket connection.
+ *
+ * Sends the speech.config message followed by one SSML message per chunk; the
+ * next chunk is sent only after the previous one is acknowledged with
+ * turn.end. Resolves with all received audio payloads concatenated.
+ *
+ * Rejects with EdgeHttpUpgradeError when the upgrade is refused, and with a
+ * plain Error on socket errors, malformed binary frames, timeout, or when no
+ * audio was received.
+ */
+function attemptSynthesis(chunks: string[], params: SynthesisParams): Promise<Buffer> {
+  return new Promise<Buffer>((resolve, reject) => {
+    const gec = generateSecMsGec();
+    const url =
+      `${WSS_URL}&ConnectionId=${connectId()}` +
+      `&Sec-MS-GEC=${gec}&Sec-MS-GEC-Version=${SEC_MS_GEC_VERSION}`;
+    const headers = { ...WSS_HEADERS, Cookie: `muid=${generateMuid()};` };
+
+    const ws = new WebSocket(url, { headers });
+    const audioChunks: Buffer[] = [];
+    let audioReceived = false;
+    let chunkIndex = 0;
+    let settled = false;
+    let timer: NodeJS.Timeout | null = null;
+
+    // Settles the promise exactly once and releases the timer and the socket.
+    const finish = (error: Error | null, result?: Buffer): void => {
+      if (settled) return;
+      settled = true;
+      if (timer) clearTimeout(timer);
+      ws.removeAllListeners();
+      // Closing a socket that is still CONNECTING (timeout, rejected upgrade)
+      // makes ws emit an asynchronous "error" event. With every listener
+      // removed that event would be an uncaught exception, so keep a no-op
+      // listener in place; the outcome has already been decided above.
+      ws.on("error", () => {});
+      if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) {
+        ws.close();
+      }
+      if (error) reject(error);
+      else resolve(result ?? Buffer.alloc(0));
+    };
+
+    timer = setTimeout(() => {
+      finish(new Error(`Edge TTS synthesis timed out after ${SYNTHESIS_TIMEOUT_MS}ms`));
+    }, SYNTHESIS_TIMEOUT_MS);
+
+    // Sends the SSML request for the current chunk, if any remain.
+    const sendNextChunk = (): void => {
+      if (chunkIndex >= chunks.length) return;
+      const ssml = buildSsml(params.voice, params.rate, params.volume, params.pitch, chunks[chunkIndex]);
+      // NOTE(review): the trailing "Z" after the timestamp is kept exactly as
+      // written; presumably it mirrors the reference client - confirm.
+      const message =
+        `X-RequestId:${connectId()}\r\n` +
+        "Content-Type:application/ssml+xml\r\n" +
+        `X-Timestamp:${jsDateString()}Z\r\n` +
+        "Path:ssml\r\n\r\n" +
+        ssml;
+      ws.send(message);
+    };
+
+    // The server answered the upgrade with a normal HTTP response (e.g. 403).
+    ws.on("unexpected-response", (_req, res) => {
+      const statusCode = res.statusCode ?? 0;
+      const serverDate = (res.headers["date"] as string | undefined) ?? null;
+      finish(new EdgeHttpUpgradeError(statusCode, serverDate));
+    });
+
+    ws.on("error", (err: NodeJS.ErrnoException) => {
+      if (!settled) finish(err);
+    });
+
+    ws.on("open", () => {
+      const configMessage =
+        `X-Timestamp:${jsDateString()}\r\n` +
+        "Content-Type:application/json; charset=utf-8\r\n" +
+        "Path:speech.config\r\n\r\n" +
+        '{"context":{"synthesis":{"audio":{"metadataoptions":' +
+        '{"sentenceBoundaryEnabled":"true","wordBoundaryEnabled":"false"},' +
+        '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"}}}}';
+      ws.send(configMessage);
+      sendNextChunk();
+    });
+
+    ws.on("message", (data, isBinary) => {
+      if (settled) return;
+      // Normalise every payload shape to a single Buffer.
+      const buf = Buffer.isBuffer(data)
+        ? data
+        : Array.isArray(data)
+          ? Buffer.concat(data)
+          : Buffer.from(data as ArrayBuffer);
+
+      if (isBinary) {
+        if (buf.length < 2) {
+          finish(new Error("Edge TTS: binary message too short"));
+          return;
+        }
+        // Binary frames: [2-byte big-endian header length][headers + \r\n][audio].
+        // The length value includes the trailing \r\n terminator, so audio
+        // starts immediately at offset 2 + headerLength.
+        const headerLength = buf.readUInt16BE(0);
+        const audioStart = 2 + headerLength;
+        // The header block begins after the 2-byte prefix, so the prefix has
+        // to be counted when checking that it fits inside the frame.
+        if (audioStart > buf.length) {
+          finish(new Error("Edge TTS: binary header length exceeds message"));
+          return;
+        }
+        const headersBlock = buf.subarray(2, audioStart).toString("utf-8");
+        if (!headersBlock.includes("Path:audio")) return;
+        const audio = buf.subarray(audioStart);
+        if (audio.length > 0) {
+          audioChunks.push(audio);
+          audioReceived = true;
+        }
+        return;
+      }
+
+      // Text frames: only turn.end matters; it marks the current chunk as done.
+      const text = buf.toString("utf-8");
+      const sep = text.indexOf("\r\n\r\n");
+      const headerBlock = sep >= 0 ? text.slice(0, sep) : text;
+      if (!headerBlock.includes("Path:turn.end")) return;
+
+      chunkIndex++;
+      if (chunkIndex < chunks.length) {
+        sendNextChunk();
+        return;
+      }
+      if (!audioReceived) {
+        finish(new Error("Edge TTS: no audio received from service"));
+      } else {
+        finish(null, Buffer.concat(audioChunks));
+      }
+    });
+
+    ws.on("close", () => {
+      if (settled) return;
+      // The peer closed before the final turn.end. Whatever audio arrived so
+      // far is returned; an attempt without any audio is an error.
+      if (!audioReceived) {
+        finish(new Error("Edge TTS: connection closed before audio was received"));
+      } else {
+        finish(null, Buffer.concat(audioChunks));
+      }
+    });
+  });
+}
+
+/** Options accepted by synthesizeWithEdgeTts; the prosody fields are optional. */
+export interface EdgeTtsOptions {
+  voice: string;
+  rate?: string;
+  volume?: string;
+  pitch?: string;
+}
+
+/**
+ * Synthesizes `text` to an MP3 Buffer using Microsoft Edge's online TTS.
+ * Throws on protocol errors, timeouts, or if no audio is returned.
+ *
+ * The text is sanitised (unsupported control characters replaced by spaces),
+ * XML-escaped and split into chunks of at most MAX_CHUNK_BYTES bytes before
+ * it is sent.
+ *
+ * @param text - Plain text to speak.
+ * @param options - Voice plus optional prosody. An empty `voice` falls back
+ *   to EDGE_DEFAULT_VOICE; `rate`/`volume` default to "+0%", `pitch` to "+0Hz".
+ * @returns The synthesized MP3 audio.
+ * @throws Error if `text` contains nothing to speak after sanitising.
+ */
+export async function synthesizeWithEdgeTts(
+  text: string,
+  options: EdgeTtsOptions,
+): Promise<Buffer> {
+  const voice = options.voice || EDGE_DEFAULT_VOICE;
+  const rate = options.rate ?? "+0%";
+  const volume = options.volume ?? "+0%";
+  const pitch = options.pitch ?? "+0Hz";
+
+  const cleaned = removeIncompatibleCharacters(text);
+  const escaped = escapeXml(cleaned);
+  const chunks = splitTextByByteLength(escaped, MAX_CHUNK_BYTES);
+
+  // Whitespace-only input yields no chunks. The service would then never
+  // report turn.end and the call would only fail once the synthesis timeout
+  // expires, so reject it up front.
+  if (chunks.length === 0) {
+    throw new Error("Edge TTS: no text to synthesize");
+  }
+
+  logger.debug(
+    `[EdgeTTS] Synthesizing: voice=${voice}, chunks=${chunks.length}, chars=${text.length}`,
+  );
+
+  return streamSynthesis(chunks, { voice, rate, volume, pitch });
+}
diff --git a/src/app/services/tts-service.ts b/src/app/services/tts-service.ts
index 48d2eb83..8c77c6f3 100644
--- a/src/app/services/tts-service.ts
+++ b/src/app/services/tts-service.ts
@@ -1,6 +1,7 @@
 import { config } from "../../config.js";
 import { logger } from "../../utils/logger.js";
 import textToSpeech from "@google-cloud/text-to-speech";
+import { synthesizeWithEdgeTts } from "./edge-tts.js";
 
 const TTS_REQUEST_TIMEOUT_MS = 60_000;
 const MAX_TTS_INPUT_CHARS = 4_000;
@@ -29,6 +30,9 @@ export function isTtsConfigured(): boolean {
   if (config.tts.provider === "google") {
     return Boolean(process.env.GOOGLE_APPLICATION_CREDENTIALS);
   }
+  if (config.tts.provider === "edge") {
+    return true;
+  }
   return Boolean(config.tts.apiUrl && config.tts.apiKey);
 }
 
@@ -203,6 +207,22 @@ async function synthesizeWithElevenLabs(text: string): Promise {
   }
 }
 
+async function synthesizeWithEdge(text: string): Promise {
+  const voice = config.tts.voice || "en-US-EmmaMultilingualNeural";
+
+  logger.debug(
+    `[TTS] Edge: voice=${voice}, chars=${text.length}`,
+  );
+
+  const buffer = await synthesizeWithEdgeTts(text, { voice });
+  if (buffer.length === 0) {
+    throw new Error("Edge TTS returned an empty audio response");
+  }
+
+  logger.debug(`[TTS] Generated Edge speech audio: ${buffer.length} bytes`);
+  return { buffer, filename: "assistant-reply.mp3", mimeType: "audio/mpeg"
}; +} + // --- Public API --- function getNotConfiguredMessage(): string { @@ -212,6 +232,9 @@ function getNotConfiguredMessage(): string { if (config.tts.provider === "elevenlabs") { return "TTS is not configured: set TTS_API_URL and TTS_API_KEY for ElevenLabs"; } + if (config.tts.provider === "edge") { + return "Edge TTS is unavailable: requires network access to speech.platform.bing.com"; + } return "TTS is not configured: set TTS_API_URL and TTS_API_KEY"; } @@ -234,6 +257,9 @@ export async function synthesizeSpeech(text: string): Promise { if (config.tts.provider === "elevenlabs") { return await synthesizeWithElevenLabs(input); } + if (config.tts.provider === "edge") { + return await synthesizeWithEdge(input); + } return await synthesizeWithOpenAi(input); } catch (err) { if (err instanceof DOMException && err.name === "AbortError") { diff --git a/src/config.ts b/src/config.ts index 13fa81b0..be9d028e 100644 --- a/src/config.ts +++ b/src/config.ts @@ -7,7 +7,7 @@ dotenv.config({ path: runtimePaths.envFilePath, quiet: true }); export type MessageFormatMode = "raw" | "markdown"; export type StreamingMode = "edit" | "draft"; -export type TtsProvider = "openai" | "google" | "elevenlabs"; +export type TtsProvider = "openai" | "google" | "elevenlabs" | "edge"; function getEnvVar(key: string, required: boolean = true): string { const value = process.env[key]; @@ -95,7 +95,7 @@ function getOptionalMessageFormatModeEnvVar( return defaultValue; } -const VALID_TTS_PROVIDERS: TtsProvider[] = ["openai", "google", "elevenlabs"]; +const VALID_TTS_PROVIDERS: TtsProvider[] = ["openai", "google", "elevenlabs", "edge"]; function getOptionalTtsProviderEnvVar(key: string, defaultValue: TtsProvider): TtsProvider { const value = getEnvVar(key, false); @@ -213,7 +213,9 @@ export const config = { ? "en-US-Studio-O" : provider === "elevenlabs" ? "21m00Tcm4TlvDq8ikWAM" - : "alloy"; + : provider === "edge" + ? 
"en-US-EmmaMultilingualNeural" + : "alloy"; const defaultModel = provider === "elevenlabs" ? "eleven_flash_v2_5" : "gpt-4o-mini-tts"; return { diff --git a/tests/app/services/edge-tts.test.ts b/tests/app/services/edge-tts.test.ts new file mode 100644 index 00000000..bc3b902a --- /dev/null +++ b/tests/app/services/edge-tts.test.ts @@ -0,0 +1,222 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +vi.mock("../../../src/utils/logger.js", () => ({ + logger: { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + }, +})); + +import { + generateSecMsGec, + splitTextByByteLength, + synthesizeWithEdgeTts, + EDGE_DEFAULT_VOICE, + SEC_MS_GEC_VERSION, + _resetClockSkew, +} from "../../../src/app/services/edge-tts.js"; + +describe("generateSecMsGec", () => { + beforeEach(() => _resetClockSkew()); + afterEach(() => _resetClockSkew()); + + it("produces a 64-char uppercase hex string", () => { + const token = generateSecMsGec(new Date("2024-01-01T00:00:00Z")); + expect(token).toMatch(/^[0-9A-F]{64}$/); + }); + + it("matches the reference vector for a fixed time", () => { + const token = generateSecMsGec(new Date("2024-01-01T00:00:00Z")); + expect(token).toBe( + "2AC0A57C1214B9458F8725BB7800499BB594EC29DDA83424BC14661707141F2F", + ); + }); + + it("is stable within the same 5-minute window", () => { + const start = new Date("2024-06-01T12:02:37Z"); + const later = new Date("2024-06-01T12:04:59Z"); + expect(generateSecMsGec(start)).toBe(generateSecMsGec(later)); + }); + + it("changes across 5-minute window boundaries", () => { + const before = new Date("2024-06-01T12:04:59Z"); + const after = new Date("2024-06-01T12:05:00Z"); + expect(generateSecMsGec(before)).not.toBe(generateSecMsGec(after)); + }); +}); + +describe("SEC_MS_GEC_VERSION / EDGE_DEFAULT_VOICE", () => { + it("exposes a Chromium-prefixed GEC version", () => { + expect(SEC_MS_GEC_VERSION).toMatch(/^1-\d+\.\d+\.\d+\.\d+$/); + }); + + it("uses a Neural voice as default", () => 
{ + expect(EDGE_DEFAULT_VOICE).toMatch(/Neural$/); + }); +}); + +describe("splitTextByByteLength", () => { + it("returns a single chunk when text fits", () => { + expect(splitTextByByteLength("hello world", 100)).toEqual(["hello world"]); + }); + + it("splits at newlines when possible", () => { + const text = "line one\nline two\nline three"; + const chunks = splitTextByByteLength(text, 15); + // Newlines act as split boundaries and are trimmed away. + expect(chunks).toEqual(["line one", "line two", "line three"]); + for (const chunk of chunks) { + expect(Buffer.byteLength(chunk, "utf-8")).toBeLessThanOrEqual(15); + } + }); + + it("splits at spaces when no newline fits", () => { + const text = "alpha beta gamma delta"; + const chunks = splitTextByByteLength(text, 12); + expect(chunks.every((c) => Buffer.byteLength(c, "utf-8") <= 12)).toBe(true); + expect(chunks.join(" ").replace(/\s+/g, " ").trim()).toContain("alpha"); + }); + + it("never splits a multi-byte UTF-8 character", () => { + // Each CJK char is 3 bytes in UTF-8; force splits mid-character. + const text = "你好世界测试文本".repeat(10); + const chunks = splitTextByByteLength(text, 8); + const roundTrip = chunks.join(""); + expect(roundTrip).toBe(text); + for (const chunk of chunks) { + expect(Buffer.byteLength(chunk, "utf-8")).toBeLessThanOrEqual(8); + } + }); + + it("does not split inside an XML entity", () => { + const text = "foo & bar"; + const chunks = splitTextByByteLength(text, 6); + // The entity "&" must stay whole within a single chunk. 
+ expect(chunks).toContain("&"); + expect( + chunks.some((c) => c.includes("&") && !c.includes("&")), + ).toBe(false); + expect(chunks.some((c) => c.includes("amp;") && !c.includes("&"))).toBe(false); + }); + + it("throws on non-positive byte length", () => { + expect(() => splitTextByByteLength("x", 0)).toThrow(); + expect(() => splitTextByByteLength("x", -1)).toThrow(); + }); +}); + +describe("synthesizeWithEdgeTts (WebSocket flow)", () => { + const fakeWs = { + on: vi.fn(), + send: vi.fn(), + close: vi.fn(), + removeAllListeners: vi.fn(), + readyState: 1, + }; + + function emit(event: string, ...args: unknown[]): void { + const handler = fakeWs.on.mock.calls.find((c) => c[0] === event)?.[1]; + if (handler) (handler as (...a: unknown[]) => void)(...args); + } + + beforeEach(() => { + vi.useFakeTimers(); + fakeWs.on.mockClear(); + fakeWs.send.mockClear(); + fakeWs.close.mockClear(); + fakeWs.removeAllListeners.mockClear(); + fakeWs.readyState = 1; + vi.resetModules(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + function installMockWs(): void { + vi.doMock("ws", () => ({ + WebSocket: vi.fn(() => fakeWs), + })); + } + + it("sends config + SSML and concatenates binary audio on turn.end", async () => { + installMockWs(); + const { synthesizeWithEdgeTts } = await import( + "../../../src/app/services/edge-tts.js" + ); + + const promise = synthesizeWithEdgeTts("Hello world", { + voice: "en-US-AriaNeural", + }); + + // Allow the constructor + listeners to register. + await Promise.resolve(); + + emit("open"); + + // Two messages sent on open: speech.config then first ssml. + expect(fakeWs.send).toHaveBeenCalledTimes(2); + expect(fakeWs.send.mock.calls[0][0]).toContain("Path:speech.config"); + const ssml = String(fakeWs.send.mock.calls[1][0]); + expect(ssml).toContain("Path:ssml"); + expect(ssml).toContain(""); + expect(ssml).toContain("Hello world"); + + // Simulate a binary audio frame from the service. 
+ // Format: [2-byte header length][headers + \r\n][audio data] + const headers = Buffer.from("Path:audio\r\nContent-Type:audio/mpeg\r\n", "utf-8"); + const prefix = Buffer.alloc(2); + prefix.writeUInt16BE(headers.length, 0); + const audioBytes = Buffer.from([0xff, 0xf3, 0x90, 0x00]); + emit("message", Buffer.concat([prefix, headers, audioBytes]), true); + + // Simulate turn.end on the text channel. + emit("message", "X-RequestId:abc\r\nPath:turn.end\r\n\r\n", false); + + const result = await promise; + expect(result).toEqual(audioBytes); + }); + + it("rejects when no audio is received before turn.end", async () => { + installMockWs(); + const { synthesizeWithEdgeTts } = await import( + "../../../src/app/services/edge-tts.js" + ); + + const promise = synthesizeWithEdgeTts("Hello", { voice: "en-US-AriaNeural" }); + await Promise.resolve(); + emit("open"); + emit("message", "Path:turn.end\r\n\r\n", false); + + await expect(promise).rejects.toThrow("no audio received"); + }); + + it("rejects on connection close before audio", async () => { + installMockWs(); + const { synthesizeWithEdgeTts } = await import( + "../../../src/app/services/edge-tts.js" + ); + + const promise = synthesizeWithEdgeTts("Hello", { voice: "en-US-AriaNeural" }); + await Promise.resolve(); + emit("open"); + emit("close"); + + await expect(promise).rejects.toThrow("connection closed"); + }); + + it("rejects on WebSocket error", async () => { + installMockWs(); + const { synthesizeWithEdgeTts } = await import( + "../../../src/app/services/edge-tts.js" + ); + + const promise = synthesizeWithEdgeTts("Hello", { voice: "en-US-AriaNeural" }); + await Promise.resolve(); + emit("error", new Error("connect ECONNREFUSED")); + + await expect(promise).rejects.toThrow("ECONNREFUSED"); + }); +}); diff --git a/tests/app/services/tts-service.test.ts b/tests/app/services/tts-service.test.ts index bc969a3a..d4172d63 100644 --- a/tests/app/services/tts-service.test.ts +++ b/tests/app/services/tts-service.test.ts @@ 
-26,6 +26,11 @@ vi.mock("@google-cloud/text-to-speech", () => { }; }); +const mockEdgeSynth = vi.hoisted(() => vi.fn()); +vi.mock("../../../src/app/services/edge-tts.js", () => ({ + synthesizeWithEdgeTts: mockEdgeSynth, +})); + const mockTts = vi.hoisted(() => ({ apiUrl: "", apiKey: "", @@ -112,6 +117,13 @@ describe("isTtsConfigured", () => { mockTts.apiKey = "xi-test-key"; expect(isTtsConfigured()).toBe(true); }); + + it("returns true for edge provider (no credentials required)", () => { + mockTts.provider = "edge"; + mockTts.apiUrl = ""; + mockTts.apiKey = ""; + expect(isTtsConfigured()).toBe(true); + }); }); describe("stripMarkdownForSpeech", () => { @@ -438,3 +450,42 @@ describe("synthesizeSpeech (ElevenLabs)", () => { ); }); }); + +describe("synthesizeSpeech (Edge)", () => { + beforeEach(() => { + mockTts.provider = "edge"; + mockTts.voice = "en-US-EmmaMultilingualNeural"; + mockTts.apiUrl = ""; + mockTts.apiKey = ""; + mockEdgeSynth.mockReset(); + vi.restoreAllMocks(); + }); + + it("delegates to synthesizeWithEdgeTts and returns mp3 bytes", async () => { + mockEdgeSynth.mockResolvedValue(Buffer.from([1, 2, 3, 4])); + + const result = await synthesizeSpeech("Hello **bold** world"); + + expect(mockEdgeSynth).toHaveBeenCalledOnce(); + const [text, options] = mockEdgeSynth.mock.calls[0]; + // Markdown is stripped before being passed to the provider. 
+ expect(text).toBe("Hello bold world"); + expect(options.voice).toBe("en-US-EmmaMultilingualNeural"); + + expect(result.filename).toBe("assistant-reply.mp3"); + expect(result.mimeType).toBe("audio/mpeg"); + expect(result.buffer).toEqual(Buffer.from([1, 2, 3, 4])); + }); + + it("throws when Edge returns an empty audio buffer", async () => { + mockEdgeSynth.mockResolvedValue(Buffer.alloc(0)); + + await expect(synthesizeSpeech("Hello")).rejects.toThrow("empty audio response"); + }); + + it("propagates upstream Edge errors", async () => { + mockEdgeSynth.mockRejectedValue(new Error("Edge TTS: no audio received")); + + await expect(synthesizeSpeech("Hello")).rejects.toThrow("no audio received"); + }); +});