From 430ba7e6f53035c54963cc2c371ecfacd83a9264 Mon Sep 17 00:00:00 2001
From: Omar Al-Jadda <omar.aljadda@gmail.com>
Date: Sun, 21 Jun 2026 15:50:21 -0700
Subject: [PATCH] feat(tts): add Microsoft Edge TTS provider

Adds a fourth TTS provider that uses Microsoft Edge's online Read
Aloud service via WebSocket. No API key, account, or TTS_API_URL
required; only outbound HTTPS/WebSocket access to
speech.platform.bing.com.

Implementation ports the protocol from the Python edge-tts reference
(https://github.com/rany2/edge-tts):
- Sec-MS-GEC DRM token: SHA256 of Windows file-time ticks (rounded down
  to 5 min) + trusted client token, with a single retry on HTTP 403 that
  corrects clock skew from the server Date header
- SSML framing with prosody rate/volume/pitch and 4096-byte UTF-8-safe
  chunking that never splits multi-byte chars or XML entities
- WebSocket message handling: binary frames use a 2-byte big-endian
  header-length prefix (length includes the trailing CRLF), audio
  starts at offset 2 + headerLength; text frames signal turn.end per
  chunk

Adds ws as a runtime dependency (Node 20+ target; global WebSocket is
only stable from Node 22+).

- New: src/app/services/edge-tts.ts + tests
- Wire 'edge' provider into config.ts and tts-service.ts
- isTtsConfigured() returns true for edge (no credentials needed)
- Document provider in .env.example and PRODUCT.md
---
 .env.example                           |   9 +-
 PRODUCT.md                             |   2 +-
 package-lock.json                      |  35 ++-
 package.json                           |   4 +-
 src/app/services/edge-tts.ts           | 395 +++++++++++++++++++++++++
 src/app/services/tts-service.ts        |  26 ++
 src/config.ts                          |   8 +-
 tests/app/services/edge-tts.test.ts    | 222 ++++++++++++++
 tests/app/services/tts-service.test.ts |  51 ++++
 9 files changed, 745 insertions(+), 7 deletions(-)
 create mode 100644 src/app/services/edge-tts.ts
 create mode 100644 tests/app/services/edge-tts.test.ts

diff --git a/.env.example b/.env.example
index 052766b0..75de9d86 100644
--- a/.env.example
+++ b/.env.example
@@ -154,7 +154,7 @@ OPENCODE_MODEL_ID=big-pickle
 
 # Text-to-Speech credentials (optional)
 # TTS reply behavior is controlled globally with /tts and persisted in settings.json.
-# Provider: "openai" (default), "elevenlabs", or "google".
+# Provider: "openai" (default), "elevenlabs", "google", or "edge".
 #
 # --- OpenAI-compatible (default) ---
 # Set TTS_API_URL and TTS_API_KEY to any OpenAI-compatible TTS endpoint.
@@ -180,3 +180,10 @@ OPENCODE_MODEL_ID=big-pickle
 # TTS_PROVIDER=google
 # TTS_VOICE=en-US-Studio-O
 # GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account-key.json
+#
+# --- Microsoft Edge TTS ---
+# Uses Microsoft Edge's online Read Aloud service. No API key or account
+# required; only an outbound HTTPS/WebSocket connection to
+# speech.platform.bing.com. Voice list: https://learn.microsoft.com/azure/ai-services/speech-service/language-support
+# TTS_PROVIDER=edge
+# TTS_VOICE=en-US-EmmaMultilingualNeural
diff --git a/PRODUCT.md b/PRODUCT.md
index 652285dd..5cf84941 100644
--- a/PRODUCT.md
+++ b/PRODUCT.md
@@ -90,7 +90,7 @@ No public inbound ports are required for normal usage.
 - Configurable opt-in display of full thinking/reasoning content
 - Configurable max code file size in KB (default: 100)
 - Optional STT settings for voice transcription (`STT_API_URL`, `STT_API_KEY`, `STT_MODEL`, `STT_LANGUAGE`)
-- Optional TTS settings for global audio replies (`TTS_PROVIDER`, `TTS_API_URL`, `TTS_API_KEY`, `TTS_MODEL`, `TTS_VOICE`)
+- Optional TTS settings for global audio replies (`TTS_PROVIDER`, `TTS_API_URL`, `TTS_API_KEY`, `TTS_MODEL`, `TTS_VOICE`); supported providers: OpenAI-compatible, ElevenLabs, Google Cloud TTS, and Microsoft Edge TTS (no API key required)
 - Optional IPv4-only mode for Telegram connectivity (`TELEGRAM_FORCE_IPV4`)
 
 ## Current Product Scope
diff --git a/package-lock.json b/package-lock.json
index 55026217..d38b8de5 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -21,7 +21,8 @@
         "remark-gfm": "^4.0.1",
         "remark-parse": "^11.0.0",
         "socks-proxy-agent": "^8.0.5",
-        "unified": "^11.0.5"
+        "unified": "^11.0.5",
+        "ws": "^8.21.0"
       },
       "bin": {
         "opencode-telegram": "dist/cli.js"
@@ -29,6 +30,7 @@
       "devDependencies": {
         "@types/better-sqlite3": "^7.6.13",
         "@types/node": "^25.0.8",
+        "@types/ws": "^8.18.1",
         "@typescript-eslint/eslint-plugin": "^8.53.0",
         "@typescript-eslint/parser": "^8.53.0",
         "@vitest/coverage-v8": "^3.2.4",
@@ -1359,6 +1361,16 @@
       "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==",
       "license": "MIT"
     },
+    "node_modules/@types/ws": {
+      "version": "8.18.1",
+      "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz",
+      "integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/node": "*"
+      }
+    },
     "node_modules/@typescript-eslint/eslint-plugin": {
       "version": "8.53.0",
       "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.53.0.tgz",
@@ -6145,6 +6157,27 @@
       "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
       "license": "ISC"
     },
+    "node_modules/ws": {
+      "version": "8.21.0",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-8.21.0.tgz",
+      "integrity": "sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=10.0.0"
+      },
+      "peerDependencies": {
+        "bufferutil": "^4.0.1",
+        "utf-8-validate": ">=5.0.2"
+      },
+      "peerDependenciesMeta": {
+        "bufferutil": {
+          "optional": true
+        },
+        "utf-8-validate": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/y18n": {
       "version": "5.0.8",
       "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
diff --git a/package.json b/package.json
index 8fe79fe3..e6b5ae03 100644
--- a/package.json
+++ b/package.json
@@ -63,11 +63,13 @@
     "remark-gfm": "^4.0.1",
     "remark-parse": "^11.0.0",
     "socks-proxy-agent": "^8.0.5",
-    "unified": "^11.0.5"
+    "unified": "^11.0.5",
+    "ws": "^8.21.0"
   },
   "devDependencies": {
     "@types/better-sqlite3": "^7.6.13",
     "@types/node": "^25.0.8",
+    "@types/ws": "^8.18.1",
     "@typescript-eslint/eslint-plugin": "^8.53.0",
     "@typescript-eslint/parser": "^8.53.0",
     "@vitest/coverage-v8": "^3.2.4",
diff --git a/src/app/services/edge-tts.ts b/src/app/services/edge-tts.ts
new file mode 100644
index 00000000..efc42e88
--- /dev/null
+++ b/src/app/services/edge-tts.ts
@@ -0,0 +1,395 @@
+import { createHash, randomBytes, randomUUID } from "crypto";
+import { WebSocket } from "ws";
+import { logger } from "../../utils/logger.js";
+
+/**
+ * Microsoft Edge online text-to-speech client.
+ *
+ * Speaks the same WebSocket protocol used by Microsoft Edge's Read Aloud
+ * feature (wss://speech.platform.bing.com/.../readaloud/edge/v1). No API key
+ * is required; access is authenticated through a SHA256 "Sec-MS-GEC" token
+ * derived from the current time.
+ *
+ * Ported from the Python reference implementation at
+ * https://github.com/rany2/edge-tts.
+ */
+
+const BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud";
+const TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"; // also hashed into the Sec-MS-GEC token
+const WSS_URL = `wss://${BASE_URL}/edge/v1?TrustedClientToken=${TRUSTED_CLIENT_TOKEN}`;
+
+const CHROMIUM_FULL_VERSION = "143.0.3650.75";
+const CHROMIUM_MAJOR_VERSION = CHROMIUM_FULL_VERSION.split(".")[0]; // used in the User-Agent header below
+export const SEC_MS_GEC_VERSION = `1-${CHROMIUM_FULL_VERSION}`; // sent as the Sec-MS-GEC-Version query parameter
+
+export const EDGE_DEFAULT_VOICE = "en-US-EmmaMultilingualNeural";
+
+const WIN_EPOCH_SECONDS = 11644473600; // seconds from 1601-01-01 (Windows file-time epoch) to 1970-01-01
+const TICKS_PER_SECOND = 10_000_000; // one file-time tick is 100 ns
+const ROUND_SECONDS = 300; // token window: 5 minutes
+
+const WSS_HEADERS: Record<string, string> = {
+  Pragma: "no-cache",
+  "Cache-Control": "no-cache",
+  Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
+  "User-Agent":
+    `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` +
+    `(KHTML, like Gecko) Chrome/${CHROMIUM_MAJOR_VERSION}.0.0.0 Safari/537.36 ` +
+    `Edg/${CHROMIUM_MAJOR_VERSION}.0.0.0`,
+  "Accept-Encoding": "gzip, deflate, br, zstd",
+  "Accept-Language": "en-US,en;q=0.9",
+};
+
+const SYNTHESIS_TIMEOUT_MS = 60_000; // deadline for one connection attempt
+const MAX_CHUNK_BYTES = 4096; // max UTF-8 bytes of escaped text per SSML message
+
+let clockSkewSeconds = 0; // seconds added to the local clock; adjusted after an HTTP 403
+
+/**
+ * Builds the Sec-MS-GEC token that is appended to the WebSocket URL.
+ *
+ * The skew-corrected time is converted to Windows file time (100-ns ticks since
+ * 1601-01-01), floored to the start of its 5-minute window, concatenated with
+ * the trusted client token, and hashed with SHA-256 (uppercase hex).
+ * @param now Instant to derive the token for; defaults to the current time.
+ */
+export function generateSecMsGec(now: Date = new Date()): string {
+  const fileTimeSeconds = now.getTime() / 1000 + clockSkewSeconds + WIN_EPOCH_SECONDS;
+  const windowStart = fileTimeSeconds - (fileTimeSeconds % ROUND_SECONDS);
+  const ticks = BigInt(Math.round(windowStart)) * BigInt(TICKS_PER_SECOND);
+  const hasher = createHash("sha256").update(`${ticks}${TRUSTED_CLIENT_TOKEN}`, "ascii");
+  return hasher.digest("hex").toUpperCase();
+}
+
+/** @internal Test-only hook: resets the module-level clock-skew correction to zero. */
+export function _resetClockSkew(): void {
+  clockSkewSeconds = 0;
+}
+
+function generateMuid(): string {
+  return randomBytes(16).toString("hex").toUpperCase(); // 32 uppercase hex chars for the muid cookie
+}
+
+function connectId(): string {
+  return randomUUID().replace(/-/g, ""); // random UUID without dashes (32 hex chars)
+}
+
+/**
+ * Formats `date` in UTC in the layout of JavaScript's `Date.prototype.toString()`,
+ * e.g. "Mon Jan 01 2024 00:00:00 GMT+0000 (Coordinated Universal Time)".
+ */
+function jsDateString(date: Date = new Date()): string {
+  const weekdays = "Sun Mon Tue Wed Thu Fri Sat".split(" ");
+  const monthNames = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split(" ");
+  const two = (value: number): string => String(value).padStart(2, "0");
+  const calendar =
+    `${weekdays[date.getUTCDay()]} ${monthNames[date.getUTCMonth()]} ` +
+    `${two(date.getUTCDate())} ${date.getUTCFullYear()}`;
+  const clock = [date.getUTCHours(), date.getUTCMinutes(), date.getUTCSeconds()].map(two).join(":");
+  return `${calendar} ${clock} GMT+0000 (Coordinated Universal Time)`;
+}
+
+/** Escapes the XML text-node metacharacters `&`, `<` and `>` (quotes are left as-is). */
+function escapeXml(text: string): string {
+  const entities: Record<string, string> = { "&": "&amp;", "<": "&lt;", ">": "&gt;" };
+  // A single pass is equivalent to escaping "&" first: emitted entities are never rescanned.
+  return text.replace(/[&<>]/g, (ch) => entities[ch] ?? ch);
+}
+
+/**
+ * Swaps the control characters the service rejects for spaces:
+ * 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F. Tab (0x09), line feed (0x0A) and
+ * carriage return (0x0D) are kept.
+ *
+ * Such characters are common in OCR'd text; without this step the service
+ * returns an error.
+ */
+function removeIncompatibleCharacters(text: string): string {
+  const isRejected = (code: number): boolean => {
+    if (code <= 8) return true; // 0x00-0x08
+    if (code === 11 || code === 12) return true; // 0x0B-0x0C
+    return code >= 14 && code <= 31; // 0x0E-0x1F
+  };
+
+  // Array.from walks code points, so surrogate pairs are copied through whole;
+  // their leading code unit is far above the control range.
+  return Array.from(text, (char) => {
+    return isRejected(char.charCodeAt(0)) ? " " : char;
+  }).join("");
+}
+
+function isValidUtf8Prefix(buf: Buffer, length: number): boolean {
+  const head = buf.subarray(0, length); // a cut-off sequence decodes to U+FFFD and fails the round-trip
+  return head.equals(Buffer.from(head.toString("utf-8"), "utf-8"));
+}
+
+/** Backs a split point up to before any "&" that has no closing ";" ahead of the split. */
+function adjustForXmlEntity(buf: Buffer, splitAt: number): number {
+  let cut = splitAt;
+  while (cut > 0) {
+    const head = buf.subarray(0, cut);
+    const amp = head.lastIndexOf("&");
+    if (amp < 0 || head.subarray(amp).includes(";")) break; // no entity is left open
+    cut = amp; // unterminated entity: split before its "&" and re-check
+  }
+  return cut;
+}
+
+/**
+ * Splits text into trimmed chunks of at most `byteLength` UTF-8 bytes, breaking at a newline or
+ * space when one is in range and never inside a multi-byte character or XML entity. A chunk is
+ * oversized only if one character is wider than `byteLength`. Throws if `byteLength` <= 0.
+ */
+export function splitTextByByteLength(text: string, byteLength: number): string[] {
+  if (byteLength <= 0) throw new Error("byteLength must be greater than 0");
+  let rest = Buffer.from(text, "utf-8");
+  const chunks: string[] = [];
+  while (rest.length > byteLength) {
+    let splitAt = rest.lastIndexOf(0x0a, byteLength - 1);
+    if (splitAt < 0) splitAt = rest.lastIndexOf(0x20, byteLength - 1);
+    if (splitAt < 0) {
+      splitAt = byteLength;
+      while (splitAt > 0 && !isValidUtf8Prefix(rest, splitAt)) splitAt--;
+    }
+    splitAt = adjustForXmlEntity(rest, splitAt);
+    if (splitAt <= 0) {
+      // Nothing fits: step over one whole character (width from its UTF-8 lead byte), not one byte.
+      const lead = rest.readUInt8(0);
+      splitAt = lead >= 0xf0 ? 4 : lead >= 0xe0 ? 3 : lead >= 0xc0 ? 2 : 1;
+    }
+    const chunk = rest.subarray(0, splitAt).toString("utf-8").trim();
+    if (chunk) chunks.push(chunk);
+    rest = rest.subarray(splitAt);
+  }
+  const remaining = rest.toString("utf-8").trim();
+  if (remaining) chunks.push(remaining);
+  return chunks;
+}
+
+function buildSsml(
+  voice: string,
+  rate: string,
+  volume: string,
+  pitch: string,
+  text: string,
+): string {
+  // Every argument is interpolated verbatim, so `text` must already be XML-escaped
+  // (synthesizeWithEdgeTts escapes it before chunking).
+  const head =
+    "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>" +
+    `<voice name='${voice}'><prosody pitch='${pitch}' rate='${rate}' volume='${volume}'>`;
+  const tail = "</prosody></voice></speak>";
+  return `${head}${text}${tail}`;
+}
+
+function parseRfc2616Date(date: string): number | null {
+  const epochMs = Date.parse(date); // NaN when the string is not a parseable date
+  return Number.isFinite(epochMs) ? epochMs / 1000 : null;
+}
+
+/** Raised from the socket's `unexpected-response` event; keeps the HTTP status and Date header. */
+class EdgeHttpUpgradeError extends Error {
+  constructor(
+    readonly statusCode: number,
+    readonly serverDate: string | null,
+  ) {
+    super(`Edge TTS WebSocket upgrade failed: HTTP ${statusCode}`);
+    this.name = "EdgeHttpUpgradeError";
+  }
+}
+
+interface SynthesisParams {
+  voice: string; // value of the SSML <voice name='...'> attribute
+  rate: string; // SSML prosody `rate` attribute
+  volume: string; // SSML prosody `volume` attribute
+  pitch: string; // SSML prosody `pitch` attribute
+}
+
+/**
+ * Runs the synthesis over one WebSocket session per attempt and returns the
+ * concatenated MP3 bytes of all chunks.
+ *
+ * A failed upgrade with HTTP 403 on the first attempt is retried exactly once,
+ * after shifting the module-level clock-skew correction onto the time taken
+ * from the response's Date header. Every other failure is rethrown unchanged,
+ * including a 403 without a parseable Date header.
+ */
+async function streamSynthesis(chunks: string[], params: SynthesisParams): Promise<Buffer> {
+  for (let attempt = 0; attempt < 2; attempt++) {
+    try {
+      return await attemptSynthesis(chunks, params);
+    } catch (err) {
+      // Retry only a first-attempt 403 that carries a usable Date header.
+      if (attempt !== 0 || !(err instanceof EdgeHttpUpgradeError)) throw err;
+      if (err.statusCode !== 403 || !err.serverDate) throw err;
+      const serverTime = parseRfc2616Date(err.serverDate);
+      if (serverTime === null) throw err;
+
+      // Offset between the server clock and our already-corrected clock.
+      const drift = serverTime - (Date.now() / 1000 + clockSkewSeconds);
+      clockSkewSeconds += drift;
+      logger.warn(
+        `[EdgeTTS] HTTP 403: adjusted clock skew by ${drift}s, retrying`,
+      );
+    }
+  }
+  // Unreachable in practice: the second attempt either returns or rethrows.
+  throw new Error("Edge TTS synthesis failed after retry");
+}
+
+/**
+ * One synthesis attempt on a fresh WebSocket: sends speech.config, then one SSML message per
+ * chunk (each after the previous `turn.end`), collecting `Path:audio` payloads in order.
+ * Rejects with EdgeHttpUpgradeError on a refused upgrade, otherwise with a plain Error.
+ */
+function attemptSynthesis(chunks: string[], params: SynthesisParams): Promise<Buffer> {
+  return new Promise<Buffer>((resolve, reject) => {
+    const url =
+      `${WSS_URL}&ConnectionId=${connectId()}` +
+      `&Sec-MS-GEC=${generateSecMsGec()}&Sec-MS-GEC-Version=${SEC_MS_GEC_VERSION}`;
+    const headers = { ...WSS_HEADERS, Cookie: `muid=${generateMuid()};` };
+
+    const ws = new WebSocket(url, { headers });
+    const audioChunks: Buffer[] = [];
+    let audioReceived = false;
+    let chunkIndex = 0;
+    let settled = false;
+    let timer: NodeJS.Timeout | null = null;
+
+    // Settles the promise exactly once and tears the connection down.
+    const finish = (error: Error | null, result?: Buffer): void => {
+      if (settled) return;
+      settled = true;
+      if (timer) clearTimeout(timer);
+      ws.removeAllListeners();
+      // close() on a socket that is still CONNECTING aborts the handshake and ws then emits
+      // "error" asynchronously; with no listener Node would throw it as an uncaught
+      // exception, so a no-op listener stays attached after the teardown.
+      ws.on("error", () => undefined);
+      if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) {
+        ws.close();
+      }
+      if (error) reject(error);
+      else resolve(result ?? Buffer.alloc(0));
+    };
+
+    timer = setTimeout(() => {
+      finish(new Error(`Edge TTS synthesis timed out after ${SYNTHESIS_TIMEOUT_MS}ms`));
+    }, SYNTHESIS_TIMEOUT_MS);
+
+    // Refused upgrade: keep the status and Date header so the caller can correct clock skew.
+    ws.on("unexpected-response", (_req, res) => {
+      const statusCode = res.statusCode ?? 0;
+      const serverDate = (res.headers["date"] as string | undefined) ?? null;
+      finish(new EdgeHttpUpgradeError(statusCode, serverDate));
+    });
+
+    ws.on("error", (err: NodeJS.ErrnoException) => finish(err));
+
+    ws.on("open", () => {
+      const configMessage =
+        `X-Timestamp:${jsDateString()}\r\n` +
+        "Content-Type:application/json; charset=utf-8\r\n" +
+        "Path:speech.config\r\n\r\n" +
+        '{"context":{"synthesis":{"audio":{"metadataoptions":' +
+        '{"sentenceBoundaryEnabled":"true","wordBoundaryEnabled":"false"},' +
+        '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"}}}}';
+      ws.send(configMessage);
+      sendNextChunk();
+    });
+
+    const sendNextChunk = (): void => {
+      if (chunkIndex >= chunks.length) return;
+      const ssml = buildSsml(params.voice, params.rate, params.volume, params.pitch, chunks[chunkIndex]);
+      const message =
+        `X-RequestId:${connectId()}\r\n` +
+        "Content-Type:application/ssml+xml\r\n" +
+        `X-Timestamp:${jsDateString()}Z\r\n` +
+        "Path:ssml\r\n\r\n" +
+        ssml;
+      ws.send(message);
+    };
+
+    ws.on("message", (data, isBinary) => {
+      if (settled) return;
+      const buf = Buffer.isBuffer(data)
+        ? data
+        : Array.isArray(data)
+          ? Buffer.concat(data)
+          : Buffer.from(data as ArrayBuffer);
+
+      if (isBinary) {
+        if (buf.length < 2) {
+          finish(new Error("Edge TTS: binary message too short"));
+          return;
+        }
+        // Binary frames: [2-byte big-endian header length][headers + \r\n][audio].
+        // The length covers the trailing \r\n but not the 2-byte prefix itself, so the
+        // headers end (and the audio starts) at offset 2 + headerLength.
+        const headerLength = buf.readUInt16BE(0);
+        const audioStart = 2 + headerLength;
+        if (audioStart > buf.length) {
+          finish(new Error("Edge TTS: binary header length exceeds message"));
+          return;
+        }
+        const headersBlock = buf.subarray(2, audioStart).toString("utf-8");
+        if (!headersBlock.includes("Path:audio")) return;
+        const audio = buf.subarray(audioStart);
+        if (audio.length > 0) {
+          audioChunks.push(audio);
+          audioReceived = true;
+        }
+        return;
+      }
+
+      const text = buf.toString("utf-8");
+      const sep = text.indexOf("\r\n\r\n");
+      const headerBlock = sep >= 0 ? text.slice(0, sep) : text;
+      if (!headerBlock.includes("Path:turn.end")) return;
+      chunkIndex++; // the current chunk's turn is complete
+      if (chunkIndex < chunks.length) {
+        sendNextChunk();
+      } else if (audioReceived) {
+        finish(null, Buffer.concat(audioChunks));
+      } else {
+        finish(new Error("Edge TTS: no audio received from service"));
+      }
+    });
+
+    ws.on("close", () => {
+      if (audioReceived) finish(null, Buffer.concat(audioChunks));
+      else finish(new Error("Edge TTS: connection closed before audio was received"));
+    });
+  });
+}
+
+export interface EdgeTtsOptions {
+  voice: string; // an empty string falls back to EDGE_DEFAULT_VOICE
+  rate?: string; // prosody rate, default "+0%"
+  volume?: string; // prosody volume, default "+0%"
+  pitch?: string; // prosody pitch, default "+0Hz"
+}
+
+/**
+ * Synthesizes `text` to an MP3 Buffer using Microsoft Edge's online TTS.
+ * Rejects on protocol errors, timeouts, or if no audio is returned, and rejects at once
+ * (without opening a connection) when `text` leaves nothing to speak after cleanup.
+ */
+export async function synthesizeWithEdgeTts(
+  text: string,
+  options: EdgeTtsOptions,
+): Promise<Buffer> {
+  const voice = options.voice || EDGE_DEFAULT_VOICE;
+  const rate = options.rate ?? "+0%";
+  const volume = options.volume ?? "+0%";
+  const pitch = options.pitch ?? "+0Hz";
+
+  const escaped = escapeXml(removeIncompatibleCharacters(text));
+  const chunks = splitTextByByteLength(escaped, MAX_CHUNK_BYTES);
+  // No chunk means no SSML is sent and no turn.end ever arrives: fail now, not at the timeout.
+  if (chunks.length === 0) throw new Error("Edge TTS: no speakable text to synthesize");
+  logger.debug(
+    `[EdgeTTS] Synthesizing: voice=${voice}, chunks=${chunks.length}, chars=${text.length}`,
+  );
+  return streamSynthesis(chunks, { voice, rate, volume, pitch });
+}
diff --git a/src/app/services/tts-service.ts b/src/app/services/tts-service.ts
index 48d2eb83..8c77c6f3 100644
--- a/src/app/services/tts-service.ts
+++ b/src/app/services/tts-service.ts
@@ -1,6 +1,7 @@
 import { config } from "../../config.js";
 import { logger } from "../../utils/logger.js";
 import textToSpeech from "@google-cloud/text-to-speech";
+import { synthesizeWithEdgeTts } from "./edge-tts.js";
 
 const TTS_REQUEST_TIMEOUT_MS = 60_000;
 const MAX_TTS_INPUT_CHARS = 4_000;
@@ -29,6 +30,9 @@ export function isTtsConfigured(): boolean {
   if (config.tts.provider === "google") {
     return Boolean(process.env.GOOGLE_APPLICATION_CREDENTIALS);
   }
+  if (config.tts.provider === "edge") {
+    return true;
+  }
   return Boolean(config.tts.apiUrl && config.tts.apiKey);
 }
 
@@ -203,6 +207,22 @@ async function synthesizeWithElevenLabs(text: string): Promise<TtsResult> {
   }
 }
 
+/** Edge provider adapter: synthesizes `text` and wraps the MP3 bytes in a TtsResult. */
+async function synthesizeWithEdge(text: string): Promise<TtsResult> {
+  // The fallback literal is the same string as EDGE_DEFAULT_VOICE in edge-tts.ts;
+  // keep the two in sync.
+  const voice = config.tts.voice || "en-US-EmmaMultilingualNeural";
+  logger.debug(`[TTS] Edge: voice=${voice}, chars=${text.length}`);
+
+  const audio = await synthesizeWithEdgeTts(text, { voice });
+  if (audio.length === 0) {
+    throw new Error("Edge TTS returned an empty audio response");
+  }
+
+  logger.debug(`[TTS] Generated Edge speech audio: ${audio.length} bytes`);
+  return { buffer: audio, filename: "assistant-reply.mp3", mimeType: "audio/mpeg" };
+}
+
 // --- Public API ---
 
 function getNotConfiguredMessage(): string {
@@ -212,6 +232,9 @@ function getNotConfiguredMessage(): string {
   if (config.tts.provider === "elevenlabs") {
     return "TTS is not configured: set TTS_API_URL and TTS_API_KEY for ElevenLabs";
   }
+  if (config.tts.provider === "edge") {
+    return "Edge TTS is unavailable: requires network access to speech.platform.bing.com";
+  }
   return "TTS is not configured: set TTS_API_URL and TTS_API_KEY";
 }
 
@@ -234,6 +257,9 @@ export async function synthesizeSpeech(text: string): Promise<TtsResult> {
     if (config.tts.provider === "elevenlabs") {
       return await synthesizeWithElevenLabs(input);
     }
+    if (config.tts.provider === "edge") {
+      return await synthesizeWithEdge(input);
+    }
     return await synthesizeWithOpenAi(input);
   } catch (err) {
     if (err instanceof DOMException && err.name === "AbortError") {
diff --git a/src/config.ts b/src/config.ts
index 13fa81b0..be9d028e 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -7,7 +7,7 @@ dotenv.config({ path: runtimePaths.envFilePath, quiet: true });
 
 export type MessageFormatMode = "raw" | "markdown";
 export type StreamingMode = "edit" | "draft";
-export type TtsProvider = "openai" | "google" | "elevenlabs";
+export type TtsProvider = "openai" | "google" | "elevenlabs" | "edge";
 
 function getEnvVar(key: string, required: boolean = true): string {
   const value = process.env[key];
@@ -95,7 +95,7 @@ function getOptionalMessageFormatModeEnvVar(
   return defaultValue;
 }
 
-const VALID_TTS_PROVIDERS: TtsProvider[] = ["openai", "google", "elevenlabs"];
+const VALID_TTS_PROVIDERS: TtsProvider[] = ["openai", "google", "elevenlabs", "edge"];
 
 function getOptionalTtsProviderEnvVar(key: string, defaultValue: TtsProvider): TtsProvider {
   const value = getEnvVar(key, false);
@@ -213,7 +213,9 @@ export const config = {
         ? "en-US-Studio-O"
         : provider === "elevenlabs"
           ? "21m00Tcm4TlvDq8ikWAM"
-          : "alloy";
+          : provider === "edge"
+            ? "en-US-EmmaMultilingualNeural"
+            : "alloy";
     const defaultModel =
       provider === "elevenlabs" ? "eleven_flash_v2_5" : "gpt-4o-mini-tts";
     return {
diff --git a/tests/app/services/edge-tts.test.ts b/tests/app/services/edge-tts.test.ts
new file mode 100644
index 00000000..bc3b902a
--- /dev/null
+++ b/tests/app/services/edge-tts.test.ts
@@ -0,0 +1,222 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+
+vi.mock("../../../src/utils/logger.js", () => ({ // replace the logger with spies so tests stay silent
+  logger: {
+    debug: vi.fn(),
+    info: vi.fn(),
+    warn: vi.fn(),
+    error: vi.fn(),
+  },
+}));
+
+import {
+  generateSecMsGec,
+  splitTextByByteLength,
+  synthesizeWithEdgeTts,
+  EDGE_DEFAULT_VOICE,
+  SEC_MS_GEC_VERSION,
+  _resetClockSkew,
+} from "../../../src/app/services/edge-tts.js";
+
+describe("generateSecMsGec", () => {
+  beforeEach(() => _resetClockSkew()); // the skew correction is module state shared by every call
+  afterEach(() => _resetClockSkew());
+
+  it("produces a 64-char uppercase hex string", () => {
+    const token = generateSecMsGec(new Date("2024-01-01T00:00:00Z"));
+    expect(token).toMatch(/^[0-9A-F]{64}$/);
+  });
+
+  it("matches the reference vector for a fixed time", () => { // NOTE(review): digest not re-derived in this review
+    const token = generateSecMsGec(new Date("2024-01-01T00:00:00Z"));
+    expect(token).toBe(
+      "2AC0A57C1214B9458F8725BB7800499BB594EC29DDA83424BC14661707141F2F",
+    );
+  });
+
+  it("is stable within the same 5-minute window", () => {
+    const start = new Date("2024-06-01T12:02:37Z"); // both instants fall in 12:00:00-12:04:59
+    const later = new Date("2024-06-01T12:04:59Z");
+    expect(generateSecMsGec(start)).toBe(generateSecMsGec(later));
+  });
+
+  it("changes across 5-minute window boundaries", () => {
+    const before = new Date("2024-06-01T12:04:59Z"); // last second of one window
+    const after = new Date("2024-06-01T12:05:00Z"); // first second of the next
+    expect(generateSecMsGec(before)).not.toBe(generateSecMsGec(after));
+  });
+});
+
+describe("SEC_MS_GEC_VERSION / EDGE_DEFAULT_VOICE", () => {
+  it("exposes a Chromium-prefixed GEC version", () => {
+    expect(SEC_MS_GEC_VERSION).toMatch(/^1-\d+\.\d+\.\d+\.\d+$/); // "1-" plus a four-part version
+  });
+
+  it("uses a Neural voice as default", () => {
+    expect(EDGE_DEFAULT_VOICE).toMatch(/Neural$/);
+  });
+});
+
+describe("splitTextByByteLength", () => {
+  it("returns a single chunk when text fits", () => {
+    expect(splitTextByByteLength("hello world", 100)).toEqual(["hello world"]);
+  });
+
+  it("splits at newlines when possible", () => {
+    const text = "line one\nline two\nline three";
+    const chunks = splitTextByByteLength(text, 15);
+    // Newlines act as split boundaries and are trimmed away.
+    expect(chunks).toEqual(["line one", "line two", "line three"]);
+    for (const chunk of chunks) {
+      expect(Buffer.byteLength(chunk, "utf-8")).toBeLessThanOrEqual(15);
+    }
+  });
+
+  it("splits at spaces when no newline fits", () => {
+    const text = "alpha beta gamma delta";
+    const chunks = splitTextByByteLength(text, 12);
+    expect(chunks.every((c) => Buffer.byteLength(c, "utf-8") <= 12)).toBe(true);
+    expect(chunks).toEqual(["alpha beta", "gamma delta"]); // last space within the first 12 bytes is at index 10
+  });
+
+  it("never splits a multi-byte UTF-8 character", () => {
+    // Each CJK char is 3 bytes in UTF-8; force splits mid-character.
+    const text = "你好世界测试文本".repeat(10);
+    const chunks = splitTextByByteLength(text, 8);
+    const roundTrip = chunks.join("");
+    expect(roundTrip).toBe(text);
+    for (const chunk of chunks) {
+      expect(Buffer.byteLength(chunk, "utf-8")).toBeLessThanOrEqual(8);
+    }
+  });
+
+  it("does not split inside an XML entity", () => {
+    const text = "foo &amp; bar";
+    const chunks = splitTextByByteLength(text, 6);
+    // The entity "&amp;" must stay whole within a single chunk.
+    expect(chunks).toContain("&amp;");
+    expect(
+      chunks.some((c) => c.includes("&amp") && !c.includes("&amp;")),
+    ).toBe(false);
+    expect(chunks.some((c) => c.includes("amp;") && !c.includes("&amp;"))).toBe(false);
+  });
+
+  it("throws on non-positive byte length", () => {
+    expect(() => splitTextByByteLength("x", 0)).toThrow();
+    expect(() => splitTextByByteLength("x", -1)).toThrow();
+  });
+});
+
+describe("synthesizeWithEdgeTts (WebSocket flow)", () => {
+  const fakeWs = {
+    on: vi.fn(),
+    send: vi.fn(),
+    close: vi.fn(),
+    removeAllListeners: vi.fn(),
+    readyState: 1, // numeric value of WebSocket.OPEN
+  };
+
+  function emit(event: string, ...args: unknown[]): void { // calls the first handler registered for `event`
+    const handler = fakeWs.on.mock.calls.find((c) => c[0] === event)?.[1];
+    if (handler) (handler as (...a: unknown[]) => void)(...args);
+  }
+
+  beforeEach(() => {
+    vi.useFakeTimers();
+    fakeWs.on.mockClear();
+    fakeWs.send.mockClear();
+    fakeWs.close.mockClear();
+    fakeWs.removeAllListeners.mockClear();
+    fakeWs.readyState = 1;
+    vi.resetModules(); // each test re-imports edge-tts after installing the "ws" mock
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+
+  function installMockWs(): void {
+    vi.doMock("ws", () => ({
+      WebSocket: vi.fn(() => fakeWs), // every `new WebSocket(...)` yields the shared fake
+    }));
+  }
+
+  it("sends config + SSML and concatenates binary audio on turn.end", async () => {
+    installMockWs();
+    const { synthesizeWithEdgeTts } = await import(
+      "../../../src/app/services/edge-tts.js"
+    );
+
+    const promise = synthesizeWithEdgeTts("Hello world", {
+      voice: "en-US-AriaNeural",
+    });
+
+    // Allow the constructor + listeners to register.
+    await Promise.resolve();
+
+    emit("open");
+
+    // Two messages sent on open: speech.config then first ssml.
+    expect(fakeWs.send).toHaveBeenCalledTimes(2);
+    expect(fakeWs.send.mock.calls[0][0]).toContain("Path:speech.config");
+    const ssml = String(fakeWs.send.mock.calls[1][0]);
+    expect(ssml).toContain("Path:ssml");
+    expect(ssml).toContain("<voice name='en-US-AriaNeural'>");
+    expect(ssml).toContain("Hello world");
+
+    // Simulate a binary audio frame from the service.
+    // Format: [2-byte header length][headers + \r\n][audio data]
+    const headers = Buffer.from("Path:audio\r\nContent-Type:audio/mpeg\r\n", "utf-8");
+    const prefix = Buffer.alloc(2);
+    prefix.writeUInt16BE(headers.length, 0);
+    const audioBytes = Buffer.from([0xff, 0xf3, 0x90, 0x00]);
+    emit("message", Buffer.concat([prefix, headers, audioBytes]), true);
+
+    // Simulate turn.end on the text channel.
+    emit("message", "X-RequestId:abc\r\nPath:turn.end\r\n\r\n", false);
+
+    const result = await promise;
+    expect(result).toEqual(audioBytes);
+  });
+
+  it("rejects when no audio is received before turn.end", async () => {
+    installMockWs();
+    const { synthesizeWithEdgeTts } = await import(
+      "../../../src/app/services/edge-tts.js"
+    );
+
+    const promise = synthesizeWithEdgeTts("Hello", { voice: "en-US-AriaNeural" });
+    await Promise.resolve();
+    emit("open");
+    emit("message", "Path:turn.end\r\n\r\n", false); // turn.end with no preceding audio frame
+
+    await expect(promise).rejects.toThrow("no audio received");
+  });
+
+  it("rejects on connection close before audio", async () => {
+    installMockWs();
+    const { synthesizeWithEdgeTts } = await import(
+      "../../../src/app/services/edge-tts.js"
+    );
+
+    const promise = synthesizeWithEdgeTts("Hello", { voice: "en-US-AriaNeural" });
+    await Promise.resolve();
+    emit("open");
+    emit("close");
+
+    await expect(promise).rejects.toThrow("connection closed");
+  });
+
+  it("rejects on WebSocket error", async () => {
+    installMockWs();
+    const { synthesizeWithEdgeTts } = await import(
+      "../../../src/app/services/edge-tts.js"
+    );
+
+    const promise = synthesizeWithEdgeTts("Hello", { voice: "en-US-AriaNeural" });
+    await Promise.resolve();
+    emit("error", new Error("connect ECONNREFUSED")); // error arrives before "open"
+
+    await expect(promise).rejects.toThrow("ECONNREFUSED");
+  });
+});
diff --git a/tests/app/services/tts-service.test.ts b/tests/app/services/tts-service.test.ts
index bc969a3a..d4172d63 100644
--- a/tests/app/services/tts-service.test.ts
+++ b/tests/app/services/tts-service.test.ts
@@ -26,6 +26,11 @@ vi.mock("@google-cloud/text-to-speech", () => {
   };
 });
 
+const mockEdgeSynth = vi.hoisted(() => vi.fn()); // hoisted so the vi.mock factory below can reference it
+vi.mock("../../../src/app/services/edge-tts.js", () => ({
+  synthesizeWithEdgeTts: mockEdgeSynth,
+}));
+
 const mockTts = vi.hoisted(() => ({
   apiUrl: "",
   apiKey: "",
@@ -112,6 +117,13 @@ describe("isTtsConfigured", () => {
     mockTts.apiKey = "xi-test-key";
     expect(isTtsConfigured()).toBe(true);
   });
+
+  it("returns true for edge provider (no credentials required)", () => {
+    mockTts.provider = "edge";
+    mockTts.apiUrl = ""; // URL and key are deliberately left empty
+    mockTts.apiKey = "";
+    expect(isTtsConfigured()).toBe(true);
+  });
 });
 
 describe("stripMarkdownForSpeech", () => {
@@ -438,3 +450,42 @@ describe("synthesizeSpeech (ElevenLabs)", () => {
     );
   });
 });
+
+describe("synthesizeSpeech (Edge)", () => {
+  beforeEach(() => {
+    mockTts.provider = "edge";
+    mockTts.voice = "en-US-EmmaMultilingualNeural";
+    mockTts.apiUrl = "";
+    mockTts.apiKey = "";
+    mockEdgeSynth.mockReset(); // drop call history and any queued result from the previous test
+    vi.restoreAllMocks();
+  });
+
+  it("delegates to synthesizeWithEdgeTts and returns mp3 bytes", async () => {
+    mockEdgeSynth.mockResolvedValue(Buffer.from([1, 2, 3, 4]));
+
+    const result = await synthesizeSpeech("Hello **bold** world");
+
+    expect(mockEdgeSynth).toHaveBeenCalledOnce();
+    const [text, options] = mockEdgeSynth.mock.calls[0];
+    // Markdown is stripped before being passed to the provider.
+    expect(text).toBe("Hello bold world");
+    expect(options.voice).toBe("en-US-EmmaMultilingualNeural");
+
+    expect(result.filename).toBe("assistant-reply.mp3");
+    expect(result.mimeType).toBe("audio/mpeg");
+    expect(result.buffer).toEqual(Buffer.from([1, 2, 3, 4]));
+  });
+
+  it("throws when Edge returns an empty audio buffer", async () => {
+    mockEdgeSynth.mockResolvedValue(Buffer.alloc(0)); // zero-length result from the provider
+
+    await expect(synthesizeSpeech("Hello")).rejects.toThrow("empty audio response");
+  });
+
+  it("propagates upstream Edge errors", async () => {
+    mockEdgeSynth.mockRejectedValue(new Error("Edge TTS: no audio received"));
+
+    await expect(synthesizeSpeech("Hello")).rejects.toThrow("no audio received");
+  });
+});