diff --git a/.env.example b/.env.example
index 6780d0f..50e226d 100644
--- a/.env.example
+++ b/.env.example
@@ -143,3 +143,70 @@ SOLRAC_WEB_TOKEN=
 # Synthetic chat id all web traffic shares. Negative integer; default −1000
 # avoids collision with real Telegram chat ids.
 # SOLRAC_WEB_CHAT_ID=-1000
+
+# ── Voice (ElevenLabs STT + TTS) ────────────────────────────────────────────
+# Off by default. When VOICE_ENABLED=true:
+#   - Web UI gets a mic button (record → STT pre-fills the composer) and a
+#     per-message speak button (TTS plays in the browser).
+#   - Telegram voice notes (msg.voice) are transcribed and fed through the
+#     normal text path.
+#   - `/voice on` (per chat, sticky) injects a word-limit prompt to the LLM
+#     AND attaches a Telegram voice note to each reply.
+#
+# REQUIRED when VOICE_ENABLED=true (boot fails loud otherwise):
+#   ELEVENLABS_API_KEY — from https://elevenlabs.io → Profile + API Keys
+#                        (starts `sk_…`). Restrict the key to Text-to-Speech
+#                        + Speech-to-Text endpoints only.
+#   ELEVENLABS_VOICE_ID — 20-char id from VoiceLab → voice detail page.
+#
+# Both ELEVENLABS_* and VOICE_* keys are scrubbed from the Claude SDK
+# subprocess env (agent.ts::sanitizedSubprocessEnv) so a compromised model
+# can't exfiltrate them via an auto-allowed Bash command.
+#
+# Privacy: audio + transcripts + TTS-bound replies hit ElevenLabs SaaS.
+# SOUL.md / SOLRAC.md never leave the host, but the speech does.
+#
+# VOICE_ENABLED=true
+# ELEVENLABS_API_KEY=sk_REPLACE_ME
+# ELEVENLABS_VOICE_ID=REPLACE_ME
+
+# TTS model. eleven_flash_v2_5 is $0.05/1k chars, low latency. Other options:
+# eleven_turbo_v2_5 (similar price/latency), eleven_multilingual_v2 ($0.10/1k,
+# better quality). Operators should pin this to match their plan / preference.
+# ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
+
+# STT model. scribe_v2 went GA March 2026 and replaces v1.
+# ELEVENLABS_STT_MODEL=scribe_v2
+
+# Hard wall — TTS requests over this length are refused: HTTP 413 on the web,
+# a user-visible chat hint on Telegram. The voice-mode prompt nudge (see
+# below) defends against this softly; the wall is the last line of defense.
+# VOICE_TTS_MAX_CHARS=3000
+
+# Soft target — when `/voice on` is set for a chat, this many words is the
+# budget injected into the LLM prompt ("respond in under N words"). The
+# model may use up to 3× when the user explicitly asks for more.
+# Clamps to [30, 200] at boot (out-of-range warns + clamps).
+# VOICE_REPLY_WORDS_HINT=60
+
+# STT upload limits. Web `/api/stt` rejects oversized bodies before paying
+# Scribe; Telegram voice-note download is bounded by the same byte cap.
+# VOICE_STT_MAX_BYTES=2097152          # 2 MiB
+# VOICE_STT_MAX_SECONDS=60             # client MediaRecorder stops at this
+
+# Independent voice cost cap (Anthropic burn is separate). Sliding 60-min
+# windows over voice_events.cost_usd_estimate.
+# VOICE_HOURLY_COST_CAP_USD=0.25       # per-chat
+# VOICE_GLOBAL_HOURLY_COST_CAP_USD=1.00  # host-wide
+
+# Output formats. Web uses MP3 (plays everywhere). Telegram uses Ogg/Opus
+# (sendVoice). §17 probe (May 2026) confirmed ElevenLabs returns
+# Ogg-containerized Opus for opus_48000_64; if a future change flips that
+# to raw Opus, set this to mp3_44100_64 and Telegram path uses sendAudio.
+# ELEVENLABS_TTS_OUTPUT_FORMAT_WEB=mp3_44100_64
+# ELEVENLABS_TTS_OUTPUT_FORMAT_TG=opus_48000_64
+
+# Pricing constants used for cap math + voice_events.cost_usd_estimate.
+# Pin to your ElevenLabs plan if it differs from the published defaults.
+# ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS=0.05
+# ELEVENLABS_STT_PRICE_USD_PER_HOUR=0.22
diff --git a/README.md b/README.md
index 24d8db6..ce2bf8b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Solrac
 
-> A self-hosted, hackable personal Agent: free local LLM (Ollama or LMStudio) or remote LLM (OpenRouter) by default, with explicit escalation to Anthropic's Claude Sonnet/Opus via the Claude Agent SDK. Reach it from Telegram or a browser; own every audit row, permission rule, and budget cap.
+> A self-hosted personal Agent you can configure, hack, and **converse with**. Reach it by text from Telegram or a browser, or by voice (ElevenLabs STT + TTS) on either transport. Free local LLM (Ollama / LMStudio) or remote (OpenRouter) by default; escalate to Anthropic's Claude Sonnet (`@`) or Opus (`!`) only when you mean it. Own every audit row, permission rule, and budget cap.
 
 <image src="./docs/solrac.png" width="300px" />
 
@@ -16,11 +16,12 @@ It's deliberately smaller and narrower than other personal-assistant projects:
 Both are broader and better-resourced. **Solrac's distinct value:**
 
 - **BYO-model engine slot.** No-prefix messages route to whichever model source you wire — free on-host (Ollama / LMStudio) or pay-per-token remote (OpenRouter). `@` (Sonnet) and `!` (Opus) are paid Claude escalations only on operator intent.
-- **Cost enforcement, not just visibility.** Sliding per-chat and global hourly USD caps that *deny* turns when hit — they sum every `cost_usd` row (Claude or OpenRouter), so remote-mode burn is gated by the same ceilings without extra configuration. Plus a daily cost-report DM.
-- **Audit-before-acting.** Every update (allowed, denied, queue-full) writes a row to one append-only SQLite table, tagged with the engine that served it (`local:ollama:...`, `remote:openrouter:...`, `claude:primary:...`).
+- **Cost enforcement, not just visibility.** Sliding per-chat and global hourly USD caps that *deny* turns when hit — they sum every `cost_usd` row (Claude or OpenRouter), so remote-mode burn is gated by the same ceilings without extra configuration. Plus a daily cost-report DM. Voice spend (ElevenLabs STT + TTS, when enabled) rides a **second** independent cost-cap axis with its own per-chat + global ceilings.
+- **Voice on every transport.** Telegram voice notes get transcribed; the web UI has a mic button and per-message speak buttons. `/voice on` turns on terse audio replies. ~120 lines of `fetch` against ElevenLabs — no SDK, no realtime WebSocket. Off by default.
+- **Audit-before-acting.** Every update (allowed, denied, queue-full) writes a row to one append-only SQLite table, tagged with the engine that served it (`local:ollama:...`, `remote:openrouter:...`, `claude:primary:...`). Voice gets a parallel `voice_events` log — every STT/TTS attempt (allowed, capped, denied, errored) is recorded.
 - **Single-process minimalism.** No HTTP framework, no Telegram framework runtime, no queue server, no Docker, no sub-agents. A few thousand lines of TypeScript you can read in an afternoon and fork.
 
-If you need multi-tenancy, voice wake, mobile companions, or 25 chat platforms, use OpenClaw or Hermes. If you want a small, cost-capped, fully audited foundation you can bend to your shape, Solrac fits.
+If you need multi-tenancy, always-listening voice wake, mobile companions, or 25 chat platforms, use OpenClaw or Hermes. If you want a small, cost-capped, fully audited foundation — with optional speech-to-text and text-to-speech on Telegram and the browser — that you can bend to your shape, Solrac fits.
 
 ## Quick start
 
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 774b493..acaf97a 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -1216,6 +1216,151 @@ The existing `policy.ts::createConfirmationBroker` is transport-agnostic — `re
 
 The transport adds `web.ts`, `web-client.ts`, `web-sanitize.ts`, and `markdown.ts`. No HTTP framework, no WebSocket framework, no extra runtime dependencies beyond `marked` (used on both transports). The "no HTTP framework" anti-goal is honored — `Bun.serve` `routes` and `fetch` only, same shape as `server.ts`.
 
+## Voice transport (optional)
+
+Off by default. Enabled via `VOICE_ENABLED=true` + ElevenLabs credentials. Adds two flows — **speech-in** (operator → text prompt) and **speech-out** (assistant reply → audio) — to both Telegram and the web UI. Implementation lives in two modules:
+
+| Module | Role |
+|---|---|
+| `src/elevenlabs.ts` | Typed `fetch` wrapper for ElevenLabs HTTP. STT (`POST /v1/speech-to-text`, multipart) and TTS-stream (`POST /v1/text-to-speech/{voice_id}/stream`, chunked body). ~165 lines, no SDK. |
+| `src/voice.ts` | Orchestration — gate, cost-cap, audit-write, transport delivery. Exports `handleWebStt`, `handleWebTts`, `handleTelegramVoiceStt`, `maybeReplyWithVoice`, `stripMarkdownForSpeech`, `buildVoiceModePrompt`. |
+
+### The two flows
+
+```
+SPEECH-IN (STT)
+   Telegram voice note               Web mic button
+   ───────────────────                ────────────────
+   poll loop → msg.voice              MediaRecorder → /api/stt (multipart)
+       │                                  │
+       │ gateUpdate (allowlist)            │ session-cookie auth
+       │ voice cost cap check              │ voice cost cap check
+       ▼                                  ▼
+   getFile + download bytes           parse multipart, validate size
+       │                                  │
+       └──────► voice.handleXxxStt ◄──────┘
+                    │
+                    ▼
+            ElevenLabs Scribe
+                    │
+                    ▼
+           voice_events row written
+                    │
+       Telegram: synthesize text Update    Web: return { ok, text } →
+       → queue.enqueue (normal turn)       browser pre-fills composer
+
+SPEECH-OUT (TTS)
+   Telegram: post-turn hook           Web: speak button on assistant msg
+   ──────────────────────              ──────────────────────────────────
+   agent/engine done + audit closed   user click → POST /api/tts
+       │                                  │ session-cookie auth
+       │ /voice on? (sessions table)       │ voice cost cap check
+       │ voice cost cap check              │ length wall
+       ▼                                  ▼
+   stripMarkdownForSpeech(final)      stripMarkdownForSpeech(markdown)
+       │                                  │
+       └──────► ElevenLabs TTS-stream ◄────┘
+                    │
+                    ▼
+           voice_events row written
+                    │
+       Telegram: buffer → sendVoice       Web: proxy-stream → <audio>
+       (Ogg/Opus) or sendAudio (MP3)      blob URL → autoplay
+```
+
+### `voice_events` table — separate from `audit`
+
+One turn can produce **multiple** voice events (for example, one STT input plus one TTS output). The `audit` table's two-writes-per-turn shape doesn't fit, so voice gets its own append-only log:
+
+```sql
+CREATE TABLE voice_events (
+  id INTEGER PRIMARY KEY AUTOINCREMENT,
+  chat_id INTEGER NOT NULL,
+  ts_ms INTEGER NOT NULL,
+  kind TEXT NOT NULL CHECK (kind IN ('stt','tts')),
+  source TEXT NOT NULL CHECK (source IN ('web','telegram')),
+  model TEXT NOT NULL,
+  voice_id TEXT,
+  audit_id INTEGER,            -- informational link to audit.id (not FK)
+  duration_ms INTEGER,         -- STT only
+  chars INTEGER,               -- TTS only
+  cost_usd_estimate REAL NOT NULL DEFAULT 0,
+  status TEXT NOT NULL CHECK (status IN ('ok','denied_cap','denied_gate','error')),
+  error_message TEXT
+);
+CREATE INDEX idx_voice_events_chat_ts ON voice_events (chat_id, ts_ms);
+CREATE INDEX idx_voice_events_ts ON voice_events (ts_ms);
+```
+
+`audit_id` is informational only (no FK) so a `denied_gate` STT — which never reaches `audit` because no allowlisted sender existed — still gets a row. Cost-cap queries sum `cost_usd_estimate` over a sliding 60-min window, filtered to `status='ok'` so denials don't double-count.
+
+### Independent voice cost cap
+
+Anthropic burn (`audit.cost_usd`) and ElevenLabs burn (`voice_events.cost_usd_estimate`) are **separate axes**. Each has its own per-chat + global sliding-60-min ceiling:
+
+| Axis | Per-chat env var | Global env var | Default |
+|---|---|---|---|
+| Anthropic | `HOURLY_COST_CAP_USD` | `GLOBAL_HOURLY_COST_CAP_USD` | $1.00 / $4.00 (4×) |
+| Voice | `VOICE_HOURLY_COST_CAP_USD` | `VOICE_GLOBAL_HOURLY_COST_CAP_USD` | $0.25 / $1.00 |
+
+Order of checks inside `voice.ts` mirrors the Anthropic cap shape: **global first, then per-chat** (a host-wide hit shouldn't be masked by a per-chat pass). For STT the gate fires before either cap; for TTS the length wall fires after both caps. Cost is **estimated** at write time (ElevenLabs doesn't return per-call billing on the wire) using the configured price constants:
+
+- **STT:** `audio_duration_secs / 3600 × ELEVENLABS_STT_PRICE_USD_PER_HOUR`
+- **TTS:** `chars / 1000 × ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS`
+
+Pin the prices to your ElevenLabs plan if the published defaults don't match.
+
+### Voice mode (`sessions.voice_replies` + `/voice on|off`)
+
+Per-chat sticky toggle backing both Telegram TTS attach AND the word-limit prompt nudge. Added to `sessions` as a `0/1` column (idempotent ALTER). The `/voice` command's parser accepts `on`, `off`, `1`, `0`, `true`, `false`, or no-arg (renders current state).
+
+When `voice_replies=1` for a chat, two things happen on every turn:
+
+1. **`voice.ts::buildVoiceModePrompt`** is called by both SOLRAC.md injection sites; the resulting `<voice-mode>` block is injected into the prompt and tells the model to keep the reply under `VOICE_REPLY_WORDS_HINT` words (default 60). The block sits **after** SOLRAC.md and **before** the cross-engine OOB block, so operator overlays can override the word limit if needed. The model may use up to 3× the limit when the user explicitly asks for more.
+2. **`maybeReplyWithVoice`** runs as the post-turn hook (after the audit row closes, only on `!isError`). It strips the markdown, checks the cost cap + length wall, calls ElevenLabs TTS, buffers the audio, and sends via `sendVoice` (Ogg/Opus) or `sendAudio` (MP3 fallback). Web turns don't invoke this — the per-message speak button does it on user demand instead.
+
+The post-turn hook is wired as an optional `attachVoiceReply` callback on `AgentRunDeps` and `EngineRunDeps`. Telegram-bound deps carry the callback; web-bound deps don't. The same `VoiceDeps` instance backs both Telegram STT/TTS and web STT/TTS — the sliding 60-min cap is shared across transports, so an operator can't double up by talking on web + Telegram simultaneously.
+
+### Footer strip
+
+The `*✅ ...*` footer line that the agent and engine paths append to every successful reply (turn count, cost, model) is UI chrome, not content. `voice.ts::stripMarkdownForSpeech` regex-strips that pattern before tokenizing, so TTS never reads "✅ remote:openrouter:z-ai/glm-5.1 · 1 tools · 6.6s · $0.0048" aloud.
+
+The strip also handles standard markdown → speech transforms via `marked.lexer`:
+- Code fences → `[code block omitted]`
+- Tables → `[table omitted]`
+- Lists → comma-joined items
+- Links → text (URL dropped)
+- Headers / bold / italic → unwrap, keep text
+
+### Env scrub additions
+
+`ELEVENLABS_*` and `VOICE_*` are added to `agent.ts::sanitizedSubprocessEnv`'s scrub list. `ELEVENLABS_API_KEY` is a billed credential that the spawned `claude` SDK subprocess has no business reading; `VOICE_*` (cost caps, model ids) shouldn't leak via an auto-allowed `Bash(echo $VOICE_...)`.
+
+### What about the web sanitizer?
+
+`web-sanitize.ts` deliberately excludes `<audio>` from its allowlist. We do NOT widen it. The `<audio>` element on the web UI is injected via `document.createElement` by `app.js`, AFTER sanitization runs on the reply body — the trust boundary doesn't move. The sanitizer is for marked-rendered LLM content; audio playback is UI chrome.
+
+### Dependency direction
+
+```
+elevenlabs.ts  →  log + config
+voice.ts       →  elevenlabs + db + log + config + policy + telegram + marked
+agent.ts       →  + voice (post-turn hook + buildVoiceModePrompt)
+engine.ts      →  + voice (post-turn hook + buildVoiceModePrompt)
+web.ts         →  + voice (handleWebStt, handleWebTts)
+main.ts        →  + voice (handleTelegramVoiceStt dispatcher, maybeReplyWithVoice)
+commands.ts    →  unchanged structurally — /voice command dispatches via db.setVoiceRepliesFlag
+```
+
+No new runtime dependency outside `marked` (already shipped). `fetch` and `FormData` are global in Bun.
+
+### Anti-goal preservation
+
+- `marked` is still the only non-SDK runtime dep — ElevenLabs is raw `fetch`, no SDK.
+- No HTTP framework added (the two new routes ride the existing `Bun.serve` instance).
+- No Telegram framework runtime added (`sendVoice`/`sendAudio` use multipart `fetch`).
+- One-PR-per-feature was reversed deliberately for the voice change (phases 1–5 landed together) — per PLAN.md §16, called out as an explicit re-evaluation.
+
 ## Anti-goals
 
 Decisions deliberately not made. Don't relitigate without strong justification.
diff --git a/docs/CONFIG.md b/docs/CONFIG.md
index a512766..2479fe7 100644
--- a/docs/CONFIG.md
+++ b/docs/CONFIG.md
@@ -52,6 +52,21 @@ Every Solrac knob is an environment variable, validated and frozen at boot by `s
 | `SOLRAC_WEB_PORT` | no | `8080` | positive int | Port for the web UI. Must differ from `PORT` (which serves `/health` & `/stats`). |
 | `SOLRAC_WEB_TOKEN` | when `SOLRAC_WEB_ENABLED=true` | — | string | Login secret. **Required even on `127.0.0.1`** — a co-tenant on a shared host could otherwise reach the unauthenticated UI. Generate with `openssl rand -hex 32`. Cookie set after login is HttpOnly + SameSite=Strict + Path=/ + Max-Age=24h. |
 | `SOLRAC_WEB_CHAT_ID` | no | `-1000` | negative int | Synthetic chat id all web traffic shares. One session per Claude tier, one cost-cap bucket, one `/clear` scope. Negative to avoid collision with real Telegram chat ids. |
+| `VOICE_ENABLED` | no | `false` | boolean | Master switch for voice (ElevenLabs STT + TTS). When `true`, `ELEVENLABS_API_KEY` AND `ELEVENLABS_VOICE_ID` MUST be set (boot fails loud otherwise). Telegram voice notes get transcribed via Scribe; `/voice on` enables per-chat audio replies via the configured voice. The web UI surfaces a mic button + per-message speak button. Independent voice cost cap (per-chat + global, sliding 60-min) over `voice_events.cost_usd_estimate` — Anthropic burn cap is unaffected. |
+| `ELEVENLABS_API_KEY` | when `VOICE_ENABLED=true` | — | string | ElevenLabs API key (`sk_…`). Get one at [elevenlabs.io](https://elevenlabs.io) → Profile + API Keys. Recommended restriction: **Text to Speech + Speech to Text only**, nothing else. **Scrubbed** from the SDK-spawned `claude` subprocess env (`agent.ts::sanitizedSubprocessEnv` strips the entire `ELEVENLABS_*` prefix). |
+| `ELEVENLABS_VOICE_ID` | when `VOICE_ENABLED=true` | — | string | 20-char voice id from ElevenLabs VoiceLab → voice detail page. Single deploy-wide voice (no per-chat override in v1). |
+| `ELEVENLABS_TTS_MODEL` | no | `eleven_flash_v2_5` | string | TTS model id. `eleven_flash_v2_5` is $0.05/1k chars, low latency. Alternatives: `eleven_turbo_v2_5` (similar), `eleven_multilingual_v2` ($0.10/1k, better quality). |
+| `ELEVENLABS_STT_MODEL` | no | `scribe_v2` | string | STT model id. `scribe_v2` is the GA replacement for v1 (March 2026). |
+| `VOICE_TTS_MAX_CHARS` | no | `3000` | positive int | Hard wall — TTS requests over this length (post-markdown-strip) are refused with HTTP 413 (web) or a chat hint (Telegram). The voice-mode prompt nudge (`VOICE_REPLY_WORDS_HINT`) defends against this softly; the wall is the last line of defense. |
+| `VOICE_REPLY_WORDS_HINT` | no | `60` | positive int | Soft target — when `/voice on` is set for a chat, this word budget is injected as a system prompt block. The model may use up to 3× when the user asks for more detail. Clamped to `[30, 200]` at boot (out-of-range warns + clamps). |
+| `VOICE_STT_MAX_BYTES` | no | `2097152` (2 MiB) | positive int | Hard ceiling on audio upload size. Web `/api/stt` rejects oversized bodies before paying Scribe; Telegram voice-note download is bounded by the same cap. |
+| `VOICE_STT_MAX_SECONDS` | no | `60` | positive int | Client-side `MediaRecorder` stop timer for the web UI mic button. |
+| `VOICE_HOURLY_COST_CAP_USD` | no | `0.25` | positive float | **Per-chat** voice cost ceiling. Sliding 60-min window over `voice_events.cost_usd_estimate`. Independent of Anthropic `HOURLY_COST_CAP_USD`. |
+| `VOICE_GLOBAL_HOURLY_COST_CAP_USD` | no | `1.00` | positive float | **Global** voice cost ceiling. Sliding 60-min window across all chats. Independent of Anthropic `GLOBAL_HOURLY_COST_CAP_USD`. |
+| `ELEVENLABS_TTS_OUTPUT_FORMAT_WEB` | no | `mp3_44100_64` | string | Output format the browser `<audio>` consumes. MP3 plays everywhere (Chromium, Firefox, Safari). |
+| `ELEVENLABS_TTS_OUTPUT_FORMAT_TG` | no | `opus_48000_64` | string | Output format Telegram `sendVoice` consumes. Defaults to Ogg/Opus — verified against ElevenLabs (May 2026 probe). If a future upstream change flips to raw Opus, set this to `mp3_44100_64`; the Telegram path then uses `sendAudio` instead of `sendVoice` (the configured format selects the send method). |
+| `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` | no | `0.05` | positive float | Pricing constant for TTS cost estimate (used by the voice cost cap). Pin to your ElevenLabs plan if it differs from the published default. |
+| `ELEVENLABS_STT_PRICE_USD_PER_HOUR` | no | `0.22` | positive float | Pricing constant for STT cost estimate. Pin to your plan if it differs. |
 
 ## Validation rules
 
@@ -73,6 +88,7 @@ Every Solrac knob is an environment variable, validated and frozen at boot by `s
 - **Local/remote mutex:** `LOCAL_ENABLED=true && REMOTE_ENABLED=true` is rejected at boot. The engine slot has a single driver per boot — picking between modes is structural, not per-message. Operators wanting both should pin `SOLRAC_DEFAULT_ENGINE=primary` and use Claude for the no-prefix path.
 - **Local-tools constraint:** `LOCAL_TOOLS_ENABLED=true` requires `SOLRAC_INTEGRATIONS_ENABLED=true` (else there are no tools to expose; boot throws).
 - **Web UI constraint:** when `SOLRAC_WEB_ENABLED=true`, `SOLRAC_WEB_TOKEN` must be set (any value; ≥32 chars recommended). `SOLRAC_WEB_PORT` must differ from `PORT`. `SOLRAC_WEB_CHAT_ID` must be a negative integer.
+- **Voice constraint:** when `VOICE_ENABLED=true`, both `ELEVENLABS_API_KEY` AND `ELEVENLABS_VOICE_ID` must be set and non-blank. `VOICE_REPLY_WORDS_HINT` is clamped to `[30, 200]` at boot with a `config.voice_reply_words_clamped` warn line if the operator value falls outside that range. All other voice values must parse as positive numbers / integers when provided. Voice cost caps (`VOICE_HOURLY_COST_CAP_USD`, `VOICE_GLOBAL_HOURLY_COST_CAP_USD`) are **independent** of the Anthropic caps — they sum `voice_events.cost_usd_estimate` over their own sliding 60-min windows.
 
 The returned `Config` object is `Object.freeze`d; `allowlistBootstrap` is also frozen. There's no runtime mutation path.
 
@@ -175,6 +191,19 @@ SOLRAC_WEB_HOST=127.0.0.1         # 0.0.0.0 to expose on LAN/Tailscale/public
 SOLRAC_WEB_PORT=8080              # must differ from PORT
 SOLRAC_WEB_TOKEN=                 # required when enabled; generate: openssl rand -hex 32
 # SOLRAC_WEB_CHAT_ID=-1000        # synthetic shared chat id for the web transport
+
+# Voice (ElevenLabs STT + TTS). Off by default. When VOICE_ENABLED=true,
+# ELEVENLABS_API_KEY + ELEVENLABS_VOICE_ID are required (boot fails loud).
+# Independent voice cost cap separate from the Anthropic cap.
+VOICE_ENABLED=false
+# ELEVENLABS_API_KEY=sk_…                # restrict the key to TTS + STT only
+# ELEVENLABS_VOICE_ID=…                  # 20-char id from VoiceLab
+# ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
+# ELEVENLABS_STT_MODEL=scribe_v2
+# VOICE_TTS_MAX_CHARS=3000               # hard wall on TTS input length
+# VOICE_REPLY_WORDS_HINT=60              # soft word budget when /voice on
+# VOICE_HOURLY_COST_CAP_USD=0.25         # per-chat sliding 60-min cap
+# VOICE_GLOBAL_HOURLY_COST_CAP_USD=1.00  # global sliding 60-min cap
 ```
 
 ### Claude-only deploy
@@ -219,6 +248,8 @@ The SDK spawns a `claude` subprocess that **inherits parent env**. Solrac scrubs
 - `TG_*` (any prefix)
 - `LOCAL_*` (any prefix — backend URL/model)
 - `REMOTE_*` (any prefix — OpenRouter API key + base URL)
+- `ELEVENLABS_*` (any prefix — voice API key + voice id)
+- `VOICE_*` (any prefix — voice cost caps + limits)
 - `STATS_BEARER_TOKEN`
 - `ALLOWLIST_BOOTSTRAP`
 - `NOTION_API_KEY`
diff --git a/docs/FEATURES.md b/docs/FEATURES.md
index 11a488a..aac5cb0 100644
--- a/docs/FEATURES.md
+++ b/docs/FEATURES.md
@@ -19,6 +19,7 @@ The complete feature list, grouped by theme. See [../README.md](../README.md) fo
 ## Transport
 
 - **Optional browser web UI** — a second `Bun.serve` instance on a configurable port serves a minimal vanilla-JS chat interface with the same agent loop, slash commands, engine routing, and tool-confirm UX as Telegram. Full markdown rendering (headers, lists, tables, fenced code) on both transports — Claude and local responses get a server-side markdown→HTML pass for Telegram and the raw markdown to the browser. Off by default; enable with `SOLRAC_WEB_ENABLED=true` plus a token. See [USAGE.md#web-ui-browser-interface](./USAGE.md#web-ui-browser-interface).
+- **Optional voice (ElevenLabs STT + TTS)** — Telegram voice notes get transcribed via ElevenLabs Scribe into the normal text path; the web UI gets a mic button that pre-fills the composer. With `/voice on` (per-chat sticky toggle), Telegram replies attach an audio voice note and a `<voice-mode>` system block tells the model to respond in under N words. Web UI gets a per-message 🔊 speak button for on-demand playback (with blob caching — replay doesn't re-bill). Independent cost cap separate from the Anthropic cap. Off by default; enable with `VOICE_ENABLED=true` plus an ElevenLabs key/voice id. See [USAGE.md#voice-elevenlabs-stt--tts](./USAGE.md#voice-elevenlabs-stt--tts).
 - **Multi-user, multi-chat** — gated by per-`from.id` allowlist.
 
 ## Safety & audit
@@ -26,7 +27,8 @@ The complete feature list, grouped by theme. See [../README.md](../README.md) fo
 - **Three-tier permission policy** — auto-allow / auto-deny / Telegram-inline-keyboard-confirm. Configurable rule tables.
 - **Per-chat hourly cost cap** — sliding 60-minute window over the audit log. Default $1.00/chat/hour.
 - **Loop detector** — denies the third call to the same `(toolName, input)` within a turn. Order-insensitive over JSON keys.
-- **Persistent audit trail** — every turn (allowed, denied, queue-full) writes a SQLite row with prompt, response, tool calls, cost, tokens, session id, status, **and engine** (`claude:primary:<modelId>` / `claude:secondary:<modelId>` / `local:<backend>:<modelId>` / `remote:openrouter:<modelId>`).
+- **Persistent audit trail** — every turn (allowed, denied, queue-full) writes a SQLite row with prompt, response, tool calls, cost, tokens, session id, status, **and engine** (`claude:primary:<modelId>` / `claude:secondary:<modelId>` / `local:<backend>:<modelId>` / `remote:openrouter:<modelId>`). When voice is enabled, every STT/TTS attempt — allowed, capped, denied at the gate, errored — writes a row to the separate `voice_events` table with kind/source/cost-estimate/status.
+- **Independent voice cost cap** — per-chat and global sliding 60-min ceilings on ElevenLabs spend (`voice_events.cost_usd_estimate`), separate from the Anthropic burn cap. ElevenLabs API key + voice-mode env vars (`ELEVENLABS_*`, `VOICE_*`) are scrubbed from the Claude SDK subprocess env so a compromised model can't exfiltrate the billed credential.
 - **Session resume across restarts** — SDK session ids persisted per chat **and per tier**; conversations survive process death.
 - **Inline-keyboard confirm UX** — 60-second timeout, fail-closed on send failure, verdict stamped into chat history after tap.
 - **Sub-agent default-deny** — `Agent`/`Task` tools disabled at SDK + policy layers.
diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md
index e3c490f..06b5bae 100644
--- a/docs/GLOSSARY.md
+++ b/docs/GLOSSARY.md
@@ -34,6 +34,8 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
 
 **edit (Telegram)** — `editMessageText` API call. Solrac edits its 🤔 stub message rather than sending many small ones. Throttled to 1.5s between edits (`agent.ts:19`).
 
+**ElevenLabs** — Hosted speech provider used by Solrac's optional voice path. STT via Scribe (`/v1/speech-to-text`); TTS via voice models (`/v1/text-to-speech/{voice_id}/stream`). Two HTTP calls, no SDK. `ELEVENLABS_*` and `VOICE_*` env vars are scrubbed from the Claude SDK subprocess (`agent.ts::sanitizedSubprocessEnv`) so a compromised model can't exfiltrate the billed credential. See `src/elevenlabs.ts`.
+
 **from.id** — Telegram user identifier. The user who actually sent a message. Differs from `chat.id` in groups and forwarded messages.
 
 **handled_updates** — SQLite table holding all claimed `update_id`s. Idempotency surface for the poll loop; pruned by a future janitor (deferred to a follow-up).
@@ -90,6 +92,8 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
 
 **stub** — The `🤔 thinking…` placeholder message Solrac sends at turn start, then edits with progress. Final state is the same message edited to the answer + footer (`<i>✅ N turns · $X.XXXX</i>`). No separate "final" message — that's intentional (see ARCHITECTURE.md "No-op-edit guard").
 
+**STT** — Speech-to-text. ElevenLabs Scribe in Solrac. Telegram voice notes get transcribed inline (`handleTelegramVoiceStt` synthesizes a text Update for the existing dispatcher); the web UI's mic button pre-fills the composer with the transcript. Off by default; gated by `VOICE_ENABLED=true`.
+
 **SOUL.md** — Operator-editable persona file at the launch cwd's root. Contains voice, stance, and the `<untrusted-content>` safety clause. Read once at boot via `instance.ts::loadSoul`; joined with an engine-specific capability note and shipped as `systemPrompt.append` (Claude path) or as the first `system` message (local path). Hard-fails at boot if missing or empty. Mirrors OpenClaw's SOUL concept (voice, not operating rules).
 
 **SOLRAC.md** — Operator-editable instance overlay at the launch cwd's root. Contains operator-specific operating rules (operator name, channel posture, project hints). Re-read per turn so live edits take effect immediately. Wrapped in `<solrac-md>...</solrac-md>` and injected at the top of the user-message envelope (Claude path) or as a second `system` message (local path). Soft-warn if missing — Solrac runs vanilla without it. Carries a `solrac-md:unedited` sentinel marker on first install so a fresh template injects nothing until the operator activates the overlay. Analogous to a per-project CLAUDE.md.
@@ -104,6 +108,8 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
 
 **TurnTracker** — `turn-tracker.ts`. Symbol-keyed `Set<symbol>` tracking active turns. `count` for `/stats`; `drain()` for shutdown.
 
+**TTS** — Text-to-speech. ElevenLabs voice models. Web UI gets a per-message 🔊 button with a cached blob — replay doesn't re-bill. Telegram gets a voice-note attachment when `/voice on` (`sessions.voice_replies = 1`). `VOICE_TTS_MAX_CHARS` (default 3000) is a length wall before the call fires. Off by default; gated by `VOICE_ENABLED=true`.
+
 **`tree_id`** — see above.
 
 **untrusted-content wrapper** — `policy.ts::wrapUntrustedContent(text, source)` returns `<untrusted-content source="…">text</untrusted-content>`. Paired with a system-prompt clause that tells the agent to treat such blocks as data, never instructions. v1 has no inbound-attachment intake yet, so the wrapper is wired but unused.
@@ -112,6 +118,12 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
 
 **verdict** — The user's tap on a confirm prompt: `"allow" | "deny" | "timeout"`. Surfaced from the broker as a `ConfirmDecision`.
 
+**voice cost cap** — Independent of the Anthropic burn cap. Per-chat `VOICE_HOURLY_COST_CAP_USD` and global `VOICE_GLOBAL_HOURLY_COST_CAP_USD`, both sliding 60-min windows over `voice_events.cost_usd_estimate`. Enforced pre-flight on every STT/TTS attempt. Cap-hit writes a `denied_cap` row with `error_message ∈ {chat_voice_cap, global_voice_cap}` and refuses the call.
+
+**voice mode** — Per-chat sticky toggle: `/voice on` / `/voice off`. Persisted as `sessions.voice_replies` (0 or 1). When on, Telegram replies attach a TTS voice note and a `<voice-mode>` system block tells the model to keep replies under N words (`VOICE_REPLY_WORDS_HINT`, clamped to [30,200]). The web UI never auto-attaches audio on this flag — its per-message 🔊 button is on-demand. See `commands.ts::runVoiceSet`.
+
+**voice_events** — Append-only SQLite table parallel to `audit`. One row per ElevenLabs attempt with `kind` (`stt`|`tts`), `source` (`web`|`telegram`), `status` (`ok`|`denied_cap`|`denied_gate`|`error`), `cost_usd_estimate`, `duration_ms` (STT) or `chars` (TTS). Source of truth for the voice cost cap. `audit_id` is informational (no FK) so denied-gate STTs — which never reach `audit` — still get a row. See [SCHEMA.md#voice_events](./SCHEMA.md#voice_events).
+
 **WAL** — SQLite Write-Ahead Log mode (`PRAGMA journal_mode = WAL`). Concurrent readers + a single writer; checkpointed to truncate on graceful shutdown (`PRAGMA wal_checkpoint(TRUNCATE)` in `lifecycle.ts`).
 
 **web transport** — Optional second transport: a `Bun.serve` instance on `SOLRAC_WEB_HOST:SOLRAC_WEB_PORT` that hosts a browser chat UI. All web traffic shares one synthetic `chat.id` (default `-1000`, settable via `SOLRAC_WEB_CHAT_ID`). Token-gated login (`SOLRAC_WEB_TOKEN`) → HttpOnly + SameSite=Strict cookie. The `WebClient` (`src/web-client.ts`) implements the same `TelegramClient` interface as the bot path, publishing to an in-process bus consumed by SSE. Off by default; see [SETUP.md#11-optional-enable-the-browser-web-ui](./SETUP.md#11-optional-enable-the-browser-web-ui).
diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md
index 86e8012..c11b4c2 100644
--- a/docs/OPERATIONS.md
+++ b/docs/OPERATIONS.md
@@ -676,6 +676,77 @@ GROUP BY skill_name, origin
 ORDER BY n DESC;
 ```
 
+### Voice events
+
+The `voice_events` table is append-only and parallel to `audit`. Every voice flow (STT in, TTS out) writes a row regardless of outcome — `ok`, `denied_cap` (voice cap fired), `denied_gate` (allowlist), or `error` (upstream / network).
+
+```sql
+-- Recent voice events for a chat (one row per stt/tts attempt)
+SELECT id,
+       datetime(ts_ms/1000, 'unixepoch') AS at,
+       kind,
+       source,
+       status,
+       duration_ms,
+       chars,
+       ROUND(cost_usd_estimate, 4) AS cost,
+       error_message
+FROM voice_events
+WHERE chat_id = ?
+ORDER BY ts_ms DESC
+LIMIT 50;
+```
+
+```sql
+-- Voice spend last 24h, per chat and per kind
+SELECT chat_id,
+       kind,
+       COUNT(*) AS attempts,
+       SUM(CASE status WHEN 'ok' THEN 1 ELSE 0 END) AS ok,
+       SUM(CASE status WHEN 'denied_cap' THEN 1 ELSE 0 END) AS capped,
+       SUM(CASE status WHEN 'error' THEN 1 ELSE 0 END) AS errored,
+       ROUND(SUM(cost_usd_estimate), 4) AS spend
+FROM voice_events
+WHERE ts_ms >= (strftime('%s','now','-1 day') * 1000)
+GROUP BY chat_id, kind
+ORDER BY spend DESC;
+```
+
+```sql
+-- Sliding 60-min global voice spend (matches the in-process cap query)
+SELECT ROUND(SUM(cost_usd_estimate), 4) AS spent_last_hour
+FROM voice_events
+WHERE ts_ms >= (strftime('%s','now','-1 hour') * 1000)
+  AND status = 'ok';
+```
+
+```sql
+-- Voice cap-deny rate by chat (last 7 days). High values mean the cap is too
+-- low OR the operator is using voice heavily — check before raising the cap.
+SELECT chat_id,
+       SUM(CASE status WHEN 'denied_cap' THEN 1 ELSE 0 END) * 100.0 / COUNT(*)
+         AS pct_denied,
+       COUNT(*) AS total_attempts
+FROM voice_events
+WHERE ts_ms >= (strftime('%s','now','-7 day') * 1000)
+GROUP BY chat_id
+HAVING total_attempts >= 5
+ORDER BY pct_denied DESC;
+```
+
+```sql
+-- Upstream errors (ElevenLabs side: rate limits, auth, transcoding issues)
+SELECT datetime(ts_ms/1000, 'unixepoch') AS at,
+       chat_id,
+       kind,
+       source,
+       error_message
+FROM voice_events
+WHERE status = 'error'
+ORDER BY ts_ms DESC
+LIMIT 20;
+```
+
 ---
 
 ## Workspace inspection
diff --git a/docs/RUNBOOK.md b/docs/RUNBOOK.md
index c0be8e2..95e9f75 100644
--- a/docs/RUNBOOK.md
+++ b/docs/RUNBOOK.md
@@ -12,6 +12,8 @@ For day-to-day operations, see [OPERATIONS.md](./OPERATIONS.md).
 - [Bot silent, no error in logs](#bot-silent-no-error)
 - [Drain timeout on shutdown](#drain-timeout)
 - [Runaway cost (cap not firing)](#runaway-cost)
+- [Voice cost runaway (ElevenLabs)](#voice-cost-runaway)
+- [Voice silent / ElevenLabs errors](#voice-errors)
 - [DB corruption / lock errors](#db-corruption)
 - [OOM kill / runaway memory](#oom)
 - [Zombie poller / stale PID](#zombie-poller)
@@ -325,6 +327,152 @@ sudo systemctl start solrac.service
 
 ---
 
+<a id="voice-cost-runaway"></a>
+
+## Voice cost runaway (ElevenLabs)
+
+### Symptoms
+
+- Daily report DM shows voice spend above the configured cap.
+- Users report that audio notes stopped arriving on `/voice on` chats (text replies still land), with no error in chat.
+- `voice_events` shows a spike in `cost_usd_estimate` or a wave of `denied_cap` rows.
+
+Note: the voice cost axis is **independent** from the Anthropic axis. A voice runaway does not show up in `audit.cost_usd` or the `/stats` `spend24hUsd` number — check `voice_events` separately.
+
+### Diagnosis
+
+The voice cost cap fires *before* the ElevenLabs call, writing a `denied_cap` row. Find the recent caps:
+
+```sh
+sqlite3 data/solrac.sqlite \
+  "SELECT id, chat_id, kind, source, error_message,
+          datetime(ts_ms/1000,'unixepoch') AS at
+   FROM voice_events
+   WHERE status = 'denied_cap'
+   ORDER BY ts_ms DESC LIMIT 20"
+```
+
+`error_message='global_voice_cap'` → host-wide ceiling fired (every chat muted).
+`error_message='chat_voice_cap'` → only the named chat is muted; others still speak.
+
+Top spenders in the last hour:
+
+```sh
+sqlite3 data/solrac.sqlite \
+  "SELECT chat_id, kind,
+          ROUND(SUM(cost_usd_estimate), 4) AS spent,
+          COUNT(*) AS attempts
+   FROM voice_events
+   WHERE ts_ms >= (strftime('%s','now') - 3600) * 1000
+     AND status = 'ok'
+   GROUP BY chat_id, kind
+   ORDER BY spent DESC"
+```
+
+Likely causes:
+
+1. **Verbose TTS replies.** Long markdown answers get spoken in full; `VOICE_TTS_MAX_CHARS` (default 1500) is a hard wall but doesn't compress shorter replies. `VOICE_REPLY_WORDS_HINT` is a model hint, not enforced.
+2. **TTS price drift.** `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` is operator-set. If your plan changed, the local estimate diverges from reality — but the cap still fires at the operator-set rate. The cap is a behavior gate, not a billing reconciliation.
+3. **Replay button refetching.** A browser cache miss on the web 🔊 button forces a re-synthesis. After the first play, each repeat click should hit the cached blob; if it doesn't, that's a UI regression — look for `voice.tts_called` log events repeating against the same `audit_id`.
+
+### Recovery
+
+Immediate stop for one chat — turn voice mode off:
+
+```sh
+sqlite3 data/solrac.sqlite \
+  "UPDATE sessions SET voice_replies = 0 WHERE chat_id = <chatId>"
+```
+
+(Or have the operator type `/voice off` in that chat.)
+
+Global stop — set `VOICE_ENABLED=false` in `.env` and restart. ElevenLabs calls return early at the boot gate; existing audio messages in chat history are untouched.
+
+Tighten the cap:
+
+```ini
+# .env
+VOICE_HOURLY_COST_CAP_USD=0.10
+VOICE_GLOBAL_HOURLY_COST_CAP_USD=0.50
+```
+
+Restart. Next turns will hit the lower ceiling sooner.
+
+### Prevention
+
+- Turning `/voice on` for a chat doesn't double the model spend — TTS speaks the text the model already produced. The voice axis is purely ElevenLabs.
+- Enable for one chat first, watch `voice_events` for a day, then decide the global ceiling.
+- Pin `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` and `ELEVENLABS_STT_PRICE_USD_PER_HOUR` to your actual plan rate so the local estimate matches your invoice.
+
+---
+
+<a id="voice-errors"></a>
+
+## Voice silent / ElevenLabs errors
+
+### Symptoms
+
+- `/voice on` confirmed, but Telegram replies arrive without an audio attachment.
+- Web mic button records and uploads, but the composer doesn't pre-fill with a transcript.
+- `voice_events` has rows with `status='error'`.
+- Boot exits with `ELEVENLABS_API_KEY is required when VOICE_ENABLED=true` (or the same for `ELEVENLABS_VOICE_ID`).
+
+### Diagnosis
+
+Read the most recent failures:
+
+```sh
+sqlite3 data/solrac.sqlite \
+  "SELECT id, chat_id, kind, source, status, error_message,
+          datetime(ts_ms/1000,'unixepoch') AS at
+   FROM voice_events
+   WHERE status IN ('error','denied_gate')
+   ORDER BY ts_ms DESC LIMIT 10"
+```
+
+| `status` / `error_message` | Cause | Fix |
+|---|---|---|
+| `error` / verbatim 401 message | API key invalid or missing required permission | Verify in the ElevenLabs dashboard that the key has *both* Speech to Text and Text to Speech permissions. A TTS-only key works for `/voice on` but mic button uploads fail. Log line: `voice.auth_failed`. |
+| `error` / verbatim 429 message | ElevenLabs throttled your account | Back off; lower TTS use; upgrade the plan. No Solrac-side retry (the cap already bounds spend). Log line: `voice.rate_limited`. |
+| `error` / verbatim 4xx message | Wrong voice id, malformed multipart, unsupported model | Confirm `ELEVENLABS_VOICE_ID` against the dashboard. For STT, Telegram voice notes are OGG/Opus (supported by Scribe). Log line: `voice.upstream_error` with the upstream status. |
+| `denied_gate` / `denied` | `from.id` not in the allowlist (for voice-note STT) | Add to `ALLOWLIST_BOOTSTRAP`, restart. Allowlist gates apply uniformly to voice and text. |
+| `denied_gate` / `no_from` | Update has no `from.id` (channel post, service message) | Not actionable — these are filtered by design. |
+| `denied_cap` / `chat_voice_cap` or `global_voice_cap` | Voice cost cap fired | See [Voice cost runaway](#voice-cost-runaway). |
+| `error` / `too_long` (rare, never reaches ElevenLabs) | Input exceeded `VOICE_STT_MAX_BYTES` / `VOICE_STT_MAX_SECONDS` or `VOICE_TTS_MAX_CHARS` | Adjust the limit, or trim the input. |
+| (boot rejection) | `VOICE_ENABLED=true` without `ELEVENLABS_API_KEY` or `ELEVENLABS_VOICE_ID` | Set both, restart. Both are required when voice is enabled. |
+
+### Recovery
+
+**Common gotcha: shell-exported `ELEVENLABS_API_KEY` overriding `.env`.** Bun reads `process.env` first; an exported value beats the file. Check:
+
+```sh
+echo "${ELEVENLABS_API_KEY:0:12}"          # what the shell sees
+grep ELEVENLABS_API_KEY .env | cut -c1-30  # what the file holds
+```
+
+If they don't match, unset the shell var and restart:
+
+```sh
+unset ELEVENLABS_API_KEY
+sudo systemctl restart solrac.service
+```
+
+For 401 / 403 specifically: rotate the key. Generate a new key in the ElevenLabs dashboard with both required permissions, paste into `.env`, restart. Verify with a direct curl before debugging Solrac further:
+
+```sh
+curl -sS -H "xi-api-key: $ELEVENLABS_API_KEY" https://api.elevenlabs.io/v1/user | jq
+```
+
+For 429: there's no retry layer in `elevenlabs.ts` — that's intentional. Wait it out; the cap already bounds in-Solrac spend.
+
+### Prevention
+
+- Use one unrestricted dev key plus a restricted prod key that has only the Text to Speech and Speech to Text permissions. ElevenLabs billing is independent of Anthropic, so rotating either key is contained.
+- Never paste the key into chat or commit it. The CLAUDE.md secret-handling rules apply to ElevenLabs keys too.
+- `ELEVENLABS_*` and `VOICE_*` env vars are scrubbed from the Claude SDK subprocess (`agent.ts::sanitizedSubprocessEnv`); a compromised model can't exfiltrate them. If you add a new ElevenLabs-related env, add it to the scrub list in the same PR.
+
+---
+
 ## DB corruption
 
 ### Symptoms
diff --git a/docs/SCHEMA.md b/docs/SCHEMA.md
index 1e23a4f..5920c13 100644
--- a/docs/SCHEMA.md
+++ b/docs/SCHEMA.md
@@ -32,6 +32,7 @@ Pragmas Solrac sets at boot:
 | `sessions` | one row per chat | per-tier SDK session ids + pending `/compact` summaries |
 | `audit` | one row per attempted turn (allowed, denied, queue-full, tool-call sub-row) | append-mostly; the source of truth |
 | `scheduled_tasks` | one row per loaded `TASK.md` | upserted at boot; `last_run_at` / `one_off_consumed` updated by the tick loop |
+| `voice_events` | one row per ElevenLabs attempt (STT or TTS; ok, capped, gated, errored) | append-only; independent axis from `audit` (own sliding 60-min cost cap) |
 
 Authoritative source for shapes + migrations: `src/db.ts` (look at the `SCHEMA` constant and the post-`SCHEMA` `ALTER TABLE` block).
 
@@ -209,6 +210,48 @@ scheduled_tasks(
 
 One row per task loaded at boot. The loader upserts `source_path / source_hash / updated_at`; the tick loop updates `last_*` on each fire (and flips `one_off_consumed = 1` for `at <ISO>` tasks). When a `TASK.md` is removed from disk, the row stays — operators query `scheduled_tasks LEFT JOIN` the registry at runtime. `/tasks` slash command joins this table with the in-memory registry to render last/next fire info.
 
+### `voice_events`
+
+```sql
+voice_events(
+  id                 INTEGER PRIMARY KEY AUTOINCREMENT,
+  chat_id            INTEGER NOT NULL,
+  ts_ms              INTEGER NOT NULL,
+  kind               TEXT NOT NULL CHECK (kind IN ('stt','tts')),
+  source             TEXT NOT NULL CHECK (source IN ('web','telegram')),
+  model              TEXT NOT NULL,                       -- e.g. 'scribe_v2', 'eleven_flash_v2_5'
+  voice_id           TEXT,                                -- null for STT
+  audit_id           INTEGER,                             -- informational; NO FK (denied_gate has no audit row)
+  duration_ms        INTEGER,                             -- STT input duration
+  chars              INTEGER,                             -- TTS char count
+  cost_usd_estimate  REAL NOT NULL DEFAULT 0,             -- local estimate from env-set prices
+  status             TEXT NOT NULL CHECK (status IN ('ok','denied_cap','denied_gate','error')),
+  error_message      TEXT
+)
+```
+
+Parallel to `audit`, not nested inside it. One turn can emit multiple voice events (a Telegram voice note triggers an STT row, a `/voice on` reply triggers a TTS row — two separate rows). Every gate path writes a row before the ElevenLabs request fires, so denials show up here even when ElevenLabs never saw the call.
+
+#### `status` values
+
+| Value | Meaning | `cost_usd_estimate` |
+|---|---|---|
+| `ok` | ElevenLabs returned 2xx; `duration_ms` (STT) or `chars` (TTS) populated. | computed from env-set price |
+| `denied_cap` | Voice cost cap fired pre-flight. `error_message` ∈ `chat_voice_cap`, `global_voice_cap`. ElevenLabs never called. | 0 |
+| `denied_gate` | Allowlist / voice-disabled rejection. `error_message` ∈ `denied`, `no_from`. ElevenLabs never called. | 0 |
+| `error` | ElevenLabs returned non-2xx or the request threw. `error_message` carries the upstream message verbatim. | 0 |
+
+Special `error_message` value: `too_long` (recorded with `status = 'error'`) — input exceeded `VOICE_STT_MAX_BYTES` / `VOICE_STT_MAX_SECONDS` (STT) or `VOICE_TTS_MAX_CHARS` (TTS) before any request fired, so unlike other `error` rows it never reached ElevenLabs.
+
+#### Cost estimate
+
+`cost_usd_estimate` is computed locally — ElevenLabs doesn't return cost on the response. Two operator-set price knobs feed the math:
+
+- STT: `(duration_ms / 1000 / 3600) * ELEVENLABS_STT_PRICE_USD_PER_HOUR`
+- TTS: `(chars / 1000) * ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS`
+
+The cap is enforced against these local estimates, not against ElevenLabs' billing API. If the published prices change, update the env — the table values aren't backfilled.
+
 ## Indexes
 
 | Index | Columns | Used by |
@@ -217,6 +260,8 @@ One row per task loaded at boot. The loader upserts `source_path / source_hash /
 | `idx_audit_chat_started` | `(chat_id, started_at)` | the cost-cap query path (`db.sumChatCostSince`) — fires before every Claude tool call |
 | `idx_audit_chat_model_started` | `(chat_id, model, started_at)` | engine-scoped queries: `outOfBandForEngine`, `recentChatTurnsForEngine`, `lastSuccessfulTurnAt`, `countChatTurnsForEngine`, etc. |
 | `idx_audit_task_started` | `(task_name, started_at)` | scheduler queries: per-task cost windows (`max_cost_usd` pre-flight) and operator audit dumps |
+| `idx_voice_events_chat_ts` | `(chat_id, ts_ms)` | per-chat voice cost-cap window (`db.voiceCostUsedLast60min`) |
+| `idx_voice_events_ts` | `(ts_ms)` | global voice cost-cap window (`db.voiceCostUsedGlobalLast60min`) |
 
 The first two are declared in the `SCHEMA` constant; the others are created after the migration block so they can reference columns added incrementally on existing databases.
 
@@ -528,6 +573,91 @@ WHERE (model LIKE 'local:%' OR model LIKE 'ollama:%')          -- dual-pattern:
   AND started_at >= (strftime('%s','now') - 7*86400) * 1000;
 ```
 
+### Voice events
+
+For spend-focused voice queries (cost by chat/kind, daily totals, cap-hit rates) see [OPERATIONS.md#voice-events](./OPERATIONS.md#voice-events). The queries below are debugging-oriented and non-overlapping.
+
+**Trace one voice attempt end-to-end** (link from an `audit.id` to the voice event it produced):
+
+```sql
+SELECT id, chat_id, kind, source, status, error_message,
+       duration_ms, chars, ROUND(cost_usd_estimate, 4) AS cost,
+       datetime(ts_ms/1000,'unixepoch') AS at
+FROM voice_events
+WHERE audit_id = <audit_id>
+ORDER BY ts_ms;
+```
+
+**Recent voice errors with verbatim ElevenLabs messages.** Only `status='error'` rows carry upstream message strings; gate/cap denials carry their own short codes.
+
+```sql
+SELECT id, chat_id, kind, source, error_message,
+       datetime(ts_ms/1000,'unixepoch') AS at
+FROM voice_events
+WHERE status = 'error'
+ORDER BY ts_ms DESC LIMIT 20;
+```
+
+**Cap-hit breakdown (chat vs global).** `error_message` distinguishes which ceiling fired.
+
+```sql
+SELECT error_message,
+       COUNT(*) AS hits,
+       COUNT(DISTINCT chat_id) AS chats_affected
+FROM voice_events
+WHERE status = 'denied_cap'
+  AND ts_ms >= (strftime('%s','now') - 86400) * 1000
+GROUP BY error_message;
+```
+
+**Gate denials by reason** (`denied` = allowlist; `no_from` = malformed update).
+
+```sql
+SELECT error_message, source, COUNT(*) AS denials
+FROM voice_events
+WHERE status = 'denied_gate'
+GROUP BY error_message, source
+ORDER BY denials DESC;
+```
+
+**Longest STT inputs.** Useful for spotting users uploading long voice notes that risk the size wall.
+
+```sql
+SELECT id, chat_id, source,
+       ROUND(duration_ms / 1000.0, 1) AS seconds,
+       ROUND(cost_usd_estimate, 4) AS cost,
+       datetime(ts_ms/1000,'unixepoch') AS at
+FROM voice_events
+WHERE kind = 'stt' AND status = 'ok'
+ORDER BY duration_ms DESC LIMIT 20;
+```
+
+**TTS char distribution** — where are long replies coming from?
+
+```sql
+SELECT chat_id,
+       COUNT(*) AS tts_calls,
+       MIN(chars) AS min_chars,
+       ROUND(AVG(chars), 0) AS avg_chars,
+       MAX(chars) AS max_chars,
+       ROUND(SUM(cost_usd_estimate), 4) AS total_cost
+FROM voice_events
+WHERE kind = 'tts' AND status = 'ok'
+  AND ts_ms >= (strftime('%s','now') - 7*86400) * 1000
+GROUP BY chat_id
+ORDER BY total_cost DESC;
+```
+
+**`/voice on` state per chat** (lives in `sessions`, not `voice_events`).
+
+```sql
+SELECT chat_id, voice_replies,
+       datetime(updated_at/1000,'unixepoch') AS last_updated
+FROM sessions
+WHERE voice_replies = 1
+ORDER BY updated_at DESC;
+```
+
 ### Tool inspection
 
 **Tool-call distribution per Claude tier (last 7 days).**
@@ -682,6 +812,8 @@ The `audit` log is the source of truth for "what did the bot do?" but it's not a
 
 - [ARCHITECTURE.md#sqlite-schema](./ARCHITECTURE.md#sqlite-schema) — schema rationale and design decisions
 - [OPERATIONS.md#audit-queries](./OPERATIONS.md#audit-queries) — cost-focused operator queries
+- [OPERATIONS.md#voice-events](./OPERATIONS.md#voice-events) — voice spend and cap-hit queries
 - [OPERATIONS.md#backup-and-restore](./OPERATIONS.md#backup-and-restore) — backup procedure
 - [RUNBOOK.md#db-corruption](./RUNBOOK.md#db-corruption) — recovery from `database disk image is malformed`
+- [RUNBOOK.md#voice-cost-runaway](./RUNBOOK.md#voice-cost-runaway) — voice cost runaway recovery
 - `src/db.ts` — schema source of truth, prepared statements, migrations
diff --git a/docs/USAGE.md b/docs/USAGE.md
index f4aa682..42f7cfc 100644
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -172,6 +172,7 @@ Slash commands give you control over conversation context and visibility into sp
 | `/context @\|!` | **none** — tier required | Show audit-table footprint (bytes), turn count, last turn's token breakdown (fresh / cache read / cache create / output), and estimated next-turn replay size. **Bare `/context` rejects** for the same reason as `/compact`. | Free |
 | `/help` | — | Engine prefix table + command reference. Engine section is dynamic (renders the deploy's actual default). | Free |
 | `/status` | — | Per-chat session/spend snapshot + global rollup + queue depth + uptime. Claude session lines render only when a session exists; a `local turns (24h): N` bullet is added when applicable. | Free |
+| `/voice [on\|off]` | shows state | Toggle per-chat voice replies. With voice mode on, Telegram replies attach an audio voice note AND a `<voice-mode>` system block tells the model to respond in under N words. Works in both Telegram and the web UI (web operators get the word-limit nudge; per-message speak button replaces the auto-attach). Requires `VOICE_ENABLED=true` at deploy. See [Voice (ElevenLabs STT + TTS)](#voice-elevenlabs-stt--tts). | Free (the flag write; TTS spend is independent) |
 
 ### Tier args
 
@@ -1114,6 +1115,97 @@ Claude and the local engine both emit markdown. Solrac now converts markdown to
 - Sessions are stored in process memory; restarting Solrac signs out all browsers (operator must log in again). The conversation history is hydrated from the audit log on next page load.
 - Confirmation prompts that arrive before the operator opens the UI are silently dropped on broker timeout (60 s). Same failure mode as Telegram when the operator's phone is off.
 
+## Voice (ElevenLabs STT + TTS)
+
+Off by default. Enabled via `VOICE_ENABLED=true` + an ElevenLabs key/voice id. Adds two affordances on both transports:
+
+- **Speech-in (STT):** Telegram voice notes get transcribed via ElevenLabs Scribe into the normal text path. The web UI gets a mic button that records via `MediaRecorder` and pre-fills the composer with the transcript.
+- **Speech-out (TTS):** with `/voice on` set for a chat, Telegram replies attach an audio voice note. On the web UI, each assistant reply gets a 🔊 button for on-demand playback.
+
+> ⚠️ **Privacy.** Audio (your spoken words) AND the assistant's reply text are sent to ElevenLabs SaaS for transcription and synthesis. `SOUL.md` and `SOLRAC.md` never leave the host, but the conversation content does. Don't enable voice on chats covering material you wouldn't paste into a third-party transcription tool.
+
+### Setup
+
+1. **ElevenLabs account.** Sign up at [elevenlabs.io](https://elevenlabs.io). Pick a paid plan if you want production usage limits — the free tier is fine for testing.
+2. **API key.** Profile + API Keys → Create API Key. Recommended: name it `solrac`, turn **Restrict Key** on, set **Text to Speech → Access** and **Speech to Text → Access**, leave everything else as `No Access`. ElevenLabs reveals the `sk_…` value **once at creation** — copy it immediately.
+3. **Voice id.** Sidebar → Voices (or VoiceLab) → pick a voice → copy the 20-char id from the voice's detail page. One voice per deploy in v1.
+4. **`.env` values:**
+   ```sh
+   VOICE_ENABLED=true
+   ELEVENLABS_API_KEY=sk_…           # the value from step 2
+   ELEVENLABS_VOICE_ID=…             # the id from step 3
+   ```
+   Restart Solrac. Boot fails loud if either of the two required vars is missing.
+
+Full env reference: [CONFIG.md#variables](./CONFIG.md#variables) (search for `VOICE_` and `ELEVENLABS_`).
+
+### Telegram voice notes
+
+**Sending audio in.** Hold the 🎙️ mic icon in Telegram's input bar (or click it on desktop), record, release to send. Solrac downloads the audio, hits Scribe, synthesizes a text Update, and feeds it back into the queue. Your prefix conventions still apply — if you say "at Sonnet what's the time" out loud, the transcript probably won't start with `@`, so the engine slot answers. To force a tier, type the prefix instead.
+
+The audio file size cap is `VOICE_STT_MAX_BYTES` (default 2 MiB). Telegram caps voice notes at ~1 minute / ~1 MB anyway, so the limit rarely fires.
+
+Solrac handles `msg.voice` only — files attached via "Send a File" (which arrive as `msg.audio`) are out of scope for v1.
+
+**Getting audio out.** Type `/voice on` once per chat — that flips a sticky per-chat flag (`sessions.voice_replies`) so every successful reply from that chat gets an audio note attached. `/voice off` turns it back off; `/voice` with no arg shows the current state. The flag survives restarts.
+
+When voice mode is on:
+- Each turn also injects a `<voice-mode>` system block telling the model to respond in under `VOICE_REPLY_WORDS_HINT` words (default 60) — the soft target keeps replies short enough to listen to.
+- After the text reply lands, Solrac strips the markdown, calls ElevenLabs TTS, and sends the audio via Telegram's `sendVoice` (Ogg/Opus voice note pill) or `sendAudio` (MP3 file) depending on the configured output format.
+- Replies longer than `VOICE_TTS_MAX_CHARS` (default 3000) are refused with a chat message — the prompt nudge usually keeps things short enough, but the wall is the last line of defense.
+
+### Web UI voice surface
+
+When `VOICE_ENABLED=true` AND you're logged in to the web UI:
+
+- **🎙️ mic button** in the composer (next to send). Click → grant mic permission → speak → click again to stop (or auto-stop at `VOICE_STT_MAX_SECONDS`, default 60). The transcript pre-fills the composer; review and send.
+- **🔊 speak button** on each assistant reply, bottom-right of the bubble. Only appears once the reply hits its final state (the `✅` footer sentinel). Click → audio plays, button switches to ⏹ (click to stop). Re-click 🔊 to replay from cache (no extra ElevenLabs spend).
+- **Voice mode badge** in the header (top right). Shows `🔊 voice mode on` when `/voice on` is active for the web chat id. Click to disable (sends `/voice off`).
+
+The web speak button is **on-demand**, not automatic — you control when you pay for synthesis. Different shape from Telegram, where voice-mode-on auto-attaches every reply.
+
+### Voice mode and engine compliance
+
+The `<voice-mode>` system block is a soft target, not a hard rule:
+
+| Engine | Compliance |
+|---|---|
+| Claude tiers (`@` / `!`) | Strong; ±20-30% drift typical. |
+| Local (Ollama / LMStudio) | Variable. Instruction-tuned models obey; tiny (1-3B param) models may drift wildly. |
+| Remote (OpenRouter) | Depends on the routed model; most modern instruction-tuned models comply. |
+
+If the model ignores the limit AND the reply exceeds `VOICE_TTS_MAX_CHARS`, the hard wall fires and you get the refusal text instead of audio. Two ways to work around:
+1. Ask for a shorter response in the next prompt ("be brief").
+2. Raise `VOICE_TTS_MAX_CHARS` in `.env`.
+
+The "expand 3×" carve-out in the prompt block gives the model room to elaborate when you explicitly ask ("explain in detail") — it'll know it has up to `60 × 3 = 180` words of headroom.
+
+### Voice cost cap
+
+Independent from the Anthropic cap. Two sliding 60-min windows:
+
+| Cap | Env var | Default |
+|---|---|---|
+| Per-chat voice | `VOICE_HOURLY_COST_CAP_USD` | $0.25/hr |
+| Global voice | `VOICE_GLOBAL_HOURLY_COST_CAP_USD` | $1.00/hr |
+
+When a cap fires:
+- Telegram STT: reply text `voice cap reached, try again in a minute`.
+- Telegram TTS: silent (text reply already shipped; we just skip the audio attach).
+- Web STT: HTTP 429, browser toast `voice cap reached`.
+- Web TTS: HTTP 429, browser toast `voice cap reached`.
+
+Cost is **estimated** from duration (STT) or character count (TTS) using the configured price constants. ElevenLabs doesn't return per-call billing on the wire, so estimates may drift from your statement — pin `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` and `ELEVENLABS_STT_PRICE_USD_PER_HOUR` to your plan if you want closer alignment.
+
+Audit queries: [OPERATIONS.md#voice-events](./OPERATIONS.md#voice-events) — every voice event (allowed, capped, denied, errored) writes a row to the `voice_events` table.
+
+### Notes & limits (v1)
+
+- Single deploy-wide voice. No per-chat override yet — change `ELEVENLABS_VOICE_ID` to switch voices for everyone.
+- Page reload drops cached audio blobs. Re-clicking 🔊 after a reload re-pays for TTS. Disk cache is a future enhancement.
+- No real-time WebSocket STT. File-based round-trip is enough at conversational latency.
+- Telegram voice mode attaches one audio note per turn; long multi-paragraph replies still respect the `VOICE_TTS_MAX_CHARS` wall.
+
 ## Related docs
 
 - [GLOSSARY.md](./GLOSSARY.md) — terminology reference
diff --git a/public/app.js b/public/app.js
index 9b56133..0580c9e 100644
--- a/public/app.js
+++ b/public/app.js
@@ -30,6 +30,9 @@ const els = {
   logoutBtn: $("logout-btn"),
   connState: $("conn-state"),
   enginePills: document.querySelectorAll(".engine-opt"),
+  micBtn: $("mic-btn"),
+  voiceBadge: $("voice-badge"),
+  toastHost: $("toast-host"),
 };
 
 // Track DOM nodes by message_id so streaming edits can replace them in place.
@@ -42,6 +45,22 @@ const messageNodes = new Map();
 let activeEngine = "";
 let stream = null;
 let firstTab = true;
+// Voice mode (mirror of sessions.voice_replies for the web chat id). Refreshed
+// from /api/voice/state on boot and after the user sends /voice on|off.
+let voiceModeOn = false;
+// Voice availability for the deploy. Probed once at boot by HEAD-ing
+// /api/voice/state — if it returns 401/200 the route exists (voice enabled);
+// 503 means the server has VOICE_ENABLED=false and the mic/speak surface
+// stays hidden.
+let voiceFeatureAvailable = false;
+// MediaRecorder state. `recorder` is the active recorder; `recordTimeoutId`
+// is the auto-stop timer that fires at VOICE_STT_MAX_SECONDS (60s server-side).
+let recorder = null;
+let recordTimeoutId = null;
+let recordChunks = [];
+// 60s — matches server-side VOICE_STT_MAX_SECONDS default. Client-side cap
+// keeps us from uploading audio the server will reject anyway.
+const RECORD_MAX_MS = 60_000;
 
 // ── Boot ───────────────────────────────────────────────
 
@@ -78,6 +97,8 @@ function bindUi() {
     pill.addEventListener("click", () => setEngine(pill.dataset.prefix));
   }
   setEngine("");
+  els.micBtn?.addEventListener("click", onMicClick);
+  els.voiceBadge?.addEventListener("click", () => sendUser("/voice off"));
 }
 
 // ── Auth ───────────────────────────────────────────────
@@ -98,6 +119,10 @@ function enterChat(history) {
   scrollToBottom();
   openStream();
   els.composerText.focus();
+  // Probe voice availability once per session. /api/voice/state returns 200
+  // when VOICE_ENABLED=true, 503 when off. We surface the mic + speak
+  // buttons only when the deploy supports them.
+  refreshVoiceState();
 }
 
 async function onLoginSubmit(e) {
@@ -185,6 +210,14 @@ function handleEvent(event) {
     if (!node) return;
     const stick = isNearBottom();
     renderBody(node.querySelector(".body"), event.markdown_source, event.html);
+    // Update the stashed markdown so the speak button (added when the
+    // final-state sentinel appears) picks up the latest text. Each edit
+    // overwrites — by the time `✅` lands, dataset.markdown has the full
+    // final reply.
+    if (typeof event.markdown_source === "string") {
+      node.dataset.markdown = event.markdown_source;
+    }
+    maybeAddSpeakButton(node);
     if (stick) scrollToBottom();
   } else if (event.kind === "reaction") {
     // We don't render reactions in the web UI v1.
@@ -233,6 +266,12 @@ async function sendUser(text) {
     els.sendBtn.disabled = false;
     els.composerText.focus();
   }
+  // Slash command that flips sessions.voice_replies — refresh the badge
+  // shortly after so the UI reflects new state. Small delay lets the
+  // command's audit row settle before we re-query.
+  if (/^\s*\/voice(\s|$)/i.test(text)) {
+    window.setTimeout(refreshVoiceState, 500);
+  }
 }
 
 function onComposerKeyDown(e) {
@@ -297,6 +336,9 @@ function appendMessage({ role, markdown, html_fallback, message_id }) {
   const li = document.createElement("li");
   li.className = `msg ${role}`;
   if (typeof message_id === "number") li.dataset.messageId = String(message_id);
+  // Stash the raw markdown on the LI so the speak button can read it
+  // straight from the DOM rather than tracking a parallel Map.
+  if (typeof markdown === "string") li.dataset.markdown = markdown;
   const roleLabel = document.createElement("div");
   roleLabel.className = "role";
   roleLabel.textContent = role === "user" ? "you" : "solrac";
@@ -307,6 +349,7 @@ function appendMessage({ role, markdown, html_fallback, message_id }) {
   li.appendChild(body);
   els.messages.appendChild(li);
   if (typeof message_id === "number") messageNodes.set(message_id, li);
+  if (role === "assistant") maybeAddSpeakButton(li);
   return li;
 }
 
@@ -344,3 +387,286 @@ function isNearBottom() {
   const el = els.messages;
   return el.scrollHeight - el.scrollTop - el.clientHeight <= STICK_THRESHOLD_PX;
 }
+
+// ── Voice ──────────────────────────────────────────────
+
+// Syncs the voice UI with the server. GET /api/voice/state: 503 hides the
+// mic + badge, 200 reveals the mic and backfills speak buttons, any other
+// status or a network/parse failure leaves the current state alone.
+async function refreshVoiceState() {
+  try {
+    const res = await fetch("/api/voice/state");
+    if (res.status === 503) {
+      // Deploy has VOICE_ENABLED=false. Hide mic + badge; add no speak buttons.
+      voiceFeatureAvailable = false;
+      voiceModeOn = false;
+      els.micBtn?.classList.add("hidden");
+      els.voiceBadge?.classList.add("hidden");
+      return;
+    }
+    if (!res.ok) return;
+    // Parse before touching state so an unparseable body changes nothing.
+    const body = await res.json();
+    voiceFeatureAvailable = true;
+    voiceModeOn = body.enabled === true;
+    els.micBtn?.classList.remove("hidden");
+    els.voiceBadge?.classList.toggle("hidden", !voiceModeOn);
+    if (voiceModeOn && els.voiceBadge) els.voiceBadge.textContent = "🔊 voice mode on";
+    // Backfill speak buttons on every rendered assistant bubble. Walking the
+    // DOM (not messageNodes) also reaches bubbles appended without a numeric
+    // message_id, which are never registered in that map.
+    for (const node of els.messages.querySelectorAll(".msg.assistant")) {
+      maybeAddSpeakButton(node);
+    }
+  } catch {
+    // Network error — leave state as-is.
+  }
+}
+
+// Mic button: a tap while recording stops it; otherwise start a new capture.
+async function onMicClick() {
+  if (!voiceFeatureAvailable) return;
+  if (recorder) return stopRecording(); // user tap to stop mid-flight
+  let mediaStream = null;
+  try {
+    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    const mime = pickMediaRecorderMime();
+    recorder = new MediaRecorder(mediaStream, mime ? { mimeType: mime } : undefined);
+    recordChunks = [];
+    recorder.addEventListener("dataavailable", (e) => {
+      if (e.data && e.data.size > 0) recordChunks.push(e.data);
+    });
+    recorder.addEventListener("stop", () => onRecordingStop(mediaStream));
+    recorder.start();
+    setMicState("recording");
+    recordTimeoutId = window.setTimeout(() => stopRecording(), RECORD_MAX_MS);
+  } catch (err) {
+    for (const t of mediaStream?.getTracks() ?? []) t.stop(); // free the mic if setup failed after it was granted
+    showToast(`mic error: ${err.message ?? "permission denied"}`);
+    recorder = null;
+    setMicState("idle");
+  }
+}
+
+// Stop the active recorder (if any) and cancel the pending auto-stop timer.
+function stopRecording() {
+  if (!recorder) return;
+  const pendingTimer = recordTimeoutId;
+  recordTimeoutId = null;
+  if (pendingTimer !== null) window.clearTimeout(pendingTimer);
+  try {
+    recorder.stop();
+  } catch {
+    // already inactive
+  }
+}
+
+// MediaRecorder "stop" handler: releases the mic, uploads the capture to
+// /api/stt and appends the transcript to the composer (never auto-sends).
+async function onRecordingStop(mediaStream) {
+  setMicState("uploading");
+  // Stop the audio tracks so the browser indicator clears immediately.
+  for (const t of mediaStream.getTracks()) t.stop();
+  const chunks = recordChunks;
+  const mime = recorder?.mimeType || "audio/webm";
+  recorder = null;
+  recordChunks = [];
+  if (chunks.length === 0) {
+    setMicState("idle");
+    return;
+  }
+  const blob = new Blob(chunks, { type: mime });
+  // Name the upload after its real container (Safari records audio/mp4).
+  const ext = mime.includes("ogg") ? "ogg" : mime.includes("mp4") ? "mp4" : "webm";
+  const form = new FormData();
+  form.append("audio", blob, `audio.${ext}`);
+  try {
+    const res = await fetch("/api/stt", { method: "POST", body: form });
+    if (res.status === 401) {
+      enterLogin();
+      return;
+    }
+    if (res.status === 413) {
+      showToast("audio too large — try a shorter clip");
+      return;
+    }
+    if (res.status === 429) {
+      showToast("voice cap reached — try again in a minute");
+      return;
+    }
+    if (!res.ok) {
+      const body = await res.json().catch(() => ({}));
+      showToast(`transcription failed: ${body.message ?? res.status}`);
+      return;
+    }
+    const body = await res.json();
+    const text = typeof body.text === "string" ? body.text : "";
+    if (text) {
+      // Pre-fill the composer; cursor at end so a quick edit-then-send is
+      // one keystroke. Operator decides whether to send — defends against
+      // STT errors. No auto-send in v1.
+      els.composerText.value = els.composerText.value
+        ? els.composerText.value + " " + text
+        : text;
+      autoResize();
+      els.composerText.focus();
+      const len = els.composerText.value.length;
+      els.composerText.setSelectionRange(len, len);
+    }
+  } catch (err) {
+    showToast(`network error: ${err.message ?? "unknown"}`);
+  } finally {
+    setMicState("idle"); // single reset point for every exit from the try block
+  }
+}
+
+// Mirror the recorder state ("idle" | "recording" | "uploading") onto the mic button.
+function setMicState(state) {
+  const btn = els.micBtn;
+  if (!btn) return;
+  for (const cls of ["recording", "uploading"]) btn.classList.toggle(cls, state === cls);
+  btn.disabled = state === "uploading";
+  btn.title = state === "recording" ? "stop recording" : state === "uploading" ? "uploading…" : "record voice";
+}
+
+// MediaRecorder mime varies by browser:
+//   - Chromium → 'audio/webm;codecs=opus' (preferred)
+//   - Firefox  → same
+//   - Safari   → 'audio/mp4' (Safari rejects 'audio/webm')
+// Pick the first supported; let MediaRecorder default if none match
+// (Scribe v2 accepts both webm/opus and mp4/aac).
+function pickMediaRecorderMime() {
+  const canProbe = typeof MediaRecorder !== "undefined" && MediaRecorder.isTypeSupported;
+  if (!canProbe) return null;
+  // Ordered by preference; null tells the caller to let MediaRecorder
+  // choose its own default container.
+  const preferred = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4"];
+  return preferred.find((m) => MediaRecorder.isTypeSupported(m)) ?? null;
+}
+
+// ── Speak (TTS) ────────────────────────────────────────
+
+// Detect "final state" of an assistant message — the agent and engine
+// runners suffix successful turns with a `✅ N turns · $X.XXXX` footer
+// in the markdown source (see agent.ts::buildFooter). When that sentinel
+// is present we know the stream is settled and the speak button is safe
+// to expose. Mid-stream the button stays absent so the operator can't
+// pay for TTS on a partial reply.
+function isFinalAssistantMarkdown(md) {
+  // Anything that is not a string (e.g. an unset dataset entry) is never final.
+  return typeof md === "string" && md.indexOf("*✅") !== -1;
+}
+
+// Attach a 🔊 button to a settled assistant bubble (at most one per bubble).
+function maybeAddSpeakButton(node) {
+  const blocked = !voiceFeatureAvailable || node.classList.contains("user");
+  if (blocked || node.querySelector(".speak-btn")) return;
+  if (!isFinalAssistantMarkdown(node.dataset.markdown)) return;
+  const speakBtn = Object.assign(document.createElement("button"), {
+    type: "button",
+    className: "speak-btn",
+    title: "speak this reply",
+    textContent: "🔊",
+  });
+  speakBtn.setAttribute("aria-label", "speak this reply");
+  speakBtn.addEventListener("click", () => onSpeakClick(node, speakBtn));
+  node.appendChild(speakBtn);
+}
+
+// Speak-button handler: stop if playing, replay cached audio, else fetch TTS.
+async function onSpeakClick(node, btn) {
+  const existingAudio = node.querySelector(".speak-audio");
+  // Click while playing → stop. Audio element stays attached (and its
+  // blob URL stays live) so the next click replays from cache without
+  // re-billing ElevenLabs.
+  if (existingAudio && !existingAudio.paused) {
+    existingAudio.pause();
+    existingAudio.currentTime = 0;
+    setSpeakButtonState(btn, "idle");
+    return;
+  }
+  // Click after a previous play that ended/stopped → replay from cache.
+  if (existingAudio) {
+    existingAudio.currentTime = 0;
+    try {
+      await existingAudio.play();
+      setSpeakButtonState(btn, "playing");
+    } catch {
+      // Autoplay policy or other transient — silently ignore; the user
+      // can click again. No toast for replay failures.
+    }
+    return;
+  }
+  // First click → fetch TTS, attach a hidden <audio>, autoplay.
+  const markdown = node.dataset.markdown ?? "";
+  const messageId = node.dataset.messageId ? Number(node.dataset.messageId) : null;
+  if (!markdown) return;
+  btn.disabled = true;
+  btn.classList.add("loading");
+  try {
+    const res = await fetch("/api/tts", {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ message_id: messageId, markdown }),
+    });
+    if (res.status === 401) {
+      enterLogin();
+      return;
+    }
+    if (res.status === 413) {
+      showToast("reply too long to speak — try /voice on for terser replies");
+      return;
+    }
+    if (res.status === 429) {
+      const body = await res.json().catch(() => ({}));
+      showToast(body.message ?? "voice cap reached");
+      return;
+    }
+    if (!res.ok) {
+      const body = await res.json().catch(() => ({}));
+      showToast(`speak failed: ${body.message ?? res.status}`);
+      return;
+    }
+    const blob = await res.blob();
+    const url = URL.createObjectURL(blob);
+    // Inject audio via DOM API (NOT innerHTML) — this element is UI chrome
+    // added by app code, not marked-rendered LLM content, so the sanitizer's
+    // trust boundary doesn't move (plan §10.3). No `controls`: CSS hides it.
+    const audio = document.createElement("audio");
+    audio.src = url;
+    audio.className = "speak-audio";
+    audio.dataset.blobUrl = url;
+    audio.addEventListener("play", () => setSpeakButtonState(btn, "playing"));
+    audio.addEventListener("pause", () => setSpeakButtonState(btn, "idle"));
+    audio.addEventListener("ended", () => setSpeakButtonState(btn, "idle"));
+    node.appendChild(audio);
+    // A blocked play() is not a network error; the audio stays cached for a retry.
+    await audio.play().catch(() => showToast("playback blocked — tap 🔊 again"));
+  } catch (err) {
+    showToast(`network error: ${err.message ?? "unknown"}`);
+  } finally {
+    btn.disabled = false;
+    btn.classList.remove("loading");
+  }
+}
+
+function setSpeakButtonState(btn, state) {
+  const playing = btn.classList.toggle("playing", state === "playing");
+  Object.assign(btn, { textContent: playing ? "⏹" : "🔊", title: playing ? "stop" : "speak this reply" });
+  btn.setAttribute("aria-label", btn.title); // keep the screen-reader label in sync with the action
+}
+
+// ── Toast ──────────────────────────────────────────────
+
+// Show a transient toast: visible for 4s, then fades (CSS) and is removed.
+function showToast(text) {
+  const host = els.toastHost;
+  if (!host) return;
+  const toast = Object.assign(document.createElement("div"), { className: "toast", textContent: text });
+  host.appendChild(toast);
+  const dismiss = () => {
+    toast.classList.add("fading");
+    window.setTimeout(() => toast.remove(), 500);
+  };
+  window.setTimeout(dismiss, 4000);
+}
diff --git a/public/index.html b/public/index.html
index 0ef7cff..c7655bd 100644
--- a/public/index.html
+++ b/public/index.html
@@ -23,6 +23,7 @@ <h1>solrac</h1>
     <header class="chat-header">
       <span class="title">solrac · web</span>
       <span class="spacer"></span>
+      <button id="voice-badge" class="voice-badge hidden" type="button" title="click to disable voice mode">🔊 voice mode on</button>
       <span id="conn-state" class="conn-state" title="stream status">…</span>
       <button id="clear-btn" class="header-btn" title="/clear all">⨯</button>
       <button id="logout-btn" class="header-btn" title="logout">⏻</button>
@@ -35,8 +36,10 @@ <h1>solrac</h1>
         <button type="button" class="engine-opt" data-prefix="!" title="secondary Claude (Opus)">!</button>
       </div>
       <textarea id="composer-text" rows="1" placeholder="message solrac…" autofocus></textarea>
+      <button type="button" id="mic-btn" class="mic-btn hidden" title="record voice" aria-label="record voice">🎙️</button>
       <button type="submit" id="send-btn" title="send (Enter)">↵</button>
     </form>
+    <div id="toast-host" class="toast-host" aria-live="polite"></div>
   </section>
 </main>
 <script src="/static/marked.min.js"></script>
diff --git a/public/style.css b/public/style.css
index dce06ec..f7af281 100644
--- a/public/style.css
+++ b/public/style.css
@@ -314,3 +314,123 @@ body {
 }
 
 #send-btn:disabled { opacity: 0.5; cursor: not-allowed; }
+
+/* ── Voice (mic + speak + badge + toast) ──────────────── */
+
+.mic-btn {
+  padding: 10px 12px;
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  background: var(--bg-elev);
+  color: var(--fg);
+  cursor: pointer;
+  font-size: 16px;
+  line-height: 1;
+}
+
+.mic-btn:hover { border-color: var(--accent); }
+.mic-btn:disabled { opacity: 0.5; cursor: not-allowed; }
+
+.mic-btn.recording {
+  border-color: var(--error);
+  color: var(--error);
+  animation: mic-pulse 1.2s ease-in-out infinite;
+}
+
+.mic-btn.uploading {
+  border-color: var(--accent);
+  color: var(--accent);
+  opacity: 0.7;
+}
+
+@keyframes mic-pulse {
+  0%, 100% { box-shadow: 0 0 0 0 rgba(185, 28, 28, 0.4); }
+  50%      { box-shadow: 0 0 0 6px rgba(185, 28, 28, 0); }
+}
+
+.voice-badge {
+  font-size: 12px;
+  color: var(--muted);
+  padding: 2px 8px;
+  border: 1px solid var(--border);
+  border-radius: 999px;
+  background: transparent;
+  cursor: pointer;
+  font-family: inherit;
+  line-height: 1.4;
+}
+
+.voice-badge:hover {
+  border-color: var(--error);
+  color: var(--error);
+}
+
+/* Per-message speak button, bottom-right of the bubble. */
+.msg.assistant {
+  position: relative;
+}
+
+.speak-btn {
+  position: absolute;
+  right: 8px;
+  bottom: 8px;
+  padding: 2px 8px;
+  border: 1px solid var(--border);
+  border-radius: 999px;
+  background: var(--bg-elev);
+  color: var(--muted);
+  cursor: pointer;
+  font-size: 13px;
+  line-height: 1;
+  opacity: 0.6;
+}
+
+.speak-btn:hover { opacity: 1; border-color: var(--accent); color: var(--accent); }
+.speak-btn:disabled { opacity: 0.4; cursor: not-allowed; }
+.speak-btn.loading { animation: mic-pulse 1.2s ease-in-out infinite; }
+
+/* Audio element is invisible — the 🔊 / ⏹ button is the only UI for
+ * playback. display:none does NOT prevent the element from playing. */
+.speak-audio { display: none; }
+
+.speak-btn.playing { opacity: 1; color: var(--accent); border-color: var(--accent); }
+
+/* Toast — bottom-right transient messages. */
+
+.toast-host {
+  position: fixed;
+  right: 16px;
+  bottom: 80px;
+  display: flex;
+  flex-direction: column;
+  gap: 8px;
+  pointer-events: none;
+  z-index: 10;
+}
+
+.toast {
+  pointer-events: auto;
+  max-width: 320px;
+  padding: 8px 12px;
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  background: var(--bg-elev);
+  color: var(--fg);
+  font-size: 13px;
+  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
+  animation: toast-in 200ms ease-out;
+}
+
+.toast.fading {
+  animation: toast-out 500ms ease-in forwards;
+}
+
+@keyframes toast-in {
+  from { opacity: 0; transform: translateY(8px); }
+  to   { opacity: 1; transform: translateY(0); }
+}
+
+@keyframes toast-out {
+  from { opacity: 1; }
+  to   { opacity: 0; }
+}
diff --git a/src/agent.ts b/src/agent.ts
index e2df33e..646883e 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -105,6 +105,7 @@ import {
 import type { SessionStore, SessionTier } from "./session.ts";
 import { mdToTelegramHtml } from "./markdown.ts";
 import { htmlEscapeText, type TelegramClient } from "./telegram.ts";
+import { buildVoiceModePrompt } from "./voice.ts";
 
 // Exported so the skill runner in commands.ts uses the same threshold as
 // runAgent — diverging values would make loop-detection behavior depend on
@@ -218,6 +219,21 @@ export interface AgentRunDeps {
   // that case we omit `mcpServers` entirely rather than registering an empty
   // server.
   mcpServer?: McpSdkServerConfigWithInstance | null;
+  // Voice — word target for the voice-mode prompt nudge. When set AND
+  // `sessions.voice_replies=1` for the chat, the augmented prompt carries
+  // a `<voice-mode>` block telling the model to keep the reply under this
+  // many words. Optional — omitted when `VOICE_ENABLED=false`.
+  voiceReplyWords?: number;
+  // Post-turn TTS attach. Called once per successful turn AFTER the audit
+  // row is finalized. Wired only on the Telegram transport when VOICE_ENABLED
+  // — web has its own per-message speak button. Best-effort: failures inside
+  // the callback log + return; they do NOT propagate.
+  attachVoiceReply?: (opts: {
+    chatId: number;
+    messageId: number | null;
+    auditId: number;
+    finalText: string;
+  }) => Promise<void>;
 }
 
 export interface AgentRunInput {
@@ -397,9 +413,23 @@ export async function runAgent(deps: AgentRunDeps, input: AgentRunInput): Promis
   // Returns null if the file is missing OR carries the unedited-template
   // marker; either way we skip the wrapper block.
   const instanceMd = readInstanceMd(deps.instanceMdPath);
+  // Voice-mode block when the chat has `/voice on` AND the deploy has
+  // configured a word target. SOLRAC.md loads BEFORE this block so an
+  // operator overlay can override (e.g. "ignore voice-mode word limits on
+  // this project") — see plan §14.
+  const voiceModeBlock =
+    deps.voiceReplyWords !== undefined && deps.db.getVoiceRepliesFlag(input.chatId)
+      ? buildVoiceModePrompt({ words: deps.voiceReplyWords })
+      : null;
   const promptToSend =
-    summary !== null || oobTurns.length > 0 || instanceMd !== null
-      ? buildAugmentedPrompt({ instanceMd, summary, oobTurns, currentPrompt: input.prompt })
+    summary !== null || oobTurns.length > 0 || instanceMd !== null || voiceModeBlock !== null
+      ? buildAugmentedPrompt({
+          instanceMd,
+          summary,
+          oobTurns,
+          currentPrompt: input.prompt,
+          voiceModeBlock,
+        })
       : input.prompt;
   if (summary !== null) {
     log.info("agent.summary_injected", {
@@ -568,6 +598,18 @@ export async function runAgent(deps: AgentRunDeps, input: AgentRunInput): Promis
     numTurns,
     isError,
   });
+
+  // Voice (TTS attach). Awaited, not fire-and-forget: the text reply is
+  // already committed, and the callback's contract is to log and swallow its
+  // own failures. It also gates on `sessions.voice_replies` and cost caps.
+  if (deps.attachVoiceReply && !isError && finalText.trim()) {
+    await deps.attachVoiceReply({
+      chatId: input.chatId,
+      messageId: stubId,
+      auditId,
+      finalText,
+    });
+  }
 }
 
 const defaultAllowAll: CanUseTool = async (toolName) => {
@@ -608,6 +650,15 @@ export function sanitizedSubprocessEnv(): Record<string, string | undefined> {
     // future REMOTE_* secret (additional providers, BYO keys, etc.) is
     // covered without needing to revisit this list.
     if (key.startsWith("REMOTE_")) continue;
+    // ELEVENLABS_* — billed credential (`xi-api-key`) + voice id. The
+    // spawned `claude` subprocess never calls ElevenLabs; `Bash(echo
+    // $ELEVENLABS_API_KEY)` is auto-allowed under BASH_SAFE_PREFIXES and
+    // would exfiltrate the key. Same rationale as REMOTE_API_KEY.
+    if (key.startsWith("ELEVENLABS_")) continue;
+    // VOICE_* — cost-cap thresholds, models, output formats. Not a secret
+    // per se, but the SDK subprocess has no business reading them; clean
+    // env reduces accidental coupling.
+    if (key.startsWith("VOICE_")) continue;
     if (key === "STATS_BEARER_TOKEN") continue;
     if (key === "ALLOWLIST_BOOTSTRAP") continue;
     if (key === "NOTION_API_KEY") continue;
@@ -723,11 +774,19 @@ export function buildAugmentedPrompt(args: {
   summary: { text: string; at: number } | null;
   oobTurns: ReadonlyArray<{ prompt: string; response: string; model: string }>;
   currentPrompt: string;
+  // Voice mode — `<voice-mode>` system block injected when `/voice on` for
+  // the chat AND the deploy has a word target configured. Sits AFTER
+  // SOLRAC.md and BEFORE summary/OOB so operator overlays can override
+  // (plan §14). `null` = voice mode off.
+  voiceModeBlock?: string | null;
 }): string {
   const lines: string[] = [];
   if (args.instanceMd !== null) {
     lines.push(wrapInstanceMd(args.instanceMd), "");
   }
+  if (args.voiceModeBlock) {
+    lines.push(args.voiceModeBlock, "");
+  }
   if (args.summary !== null) {
     const ts = new Date(args.summary.at).toISOString();
     lines.push(
diff --git a/src/commands.test.ts b/src/commands.test.ts
index 60708e1..4c94c37 100644
--- a/src/commands.test.ts
+++ b/src/commands.test.ts
@@ -486,6 +486,8 @@ function makeFakeTg(): TelegramClient & { sent: SentMessage[] } {
       username: "solrac_dev_bot",
     })) as never,
     setMyCommands: (async () => true) as never,
+    sendVoice: (async () => ({ message_id: 0, date: 0, chat: { id: 0, type: "private" } })) as never,
+    sendAudio: (async () => ({ message_id: 0, date: 0, chat: { id: 0, type: "private" } })) as never,
   } as TelegramClient & { sent: SentMessage[] };
 }
 
diff --git a/src/commands.ts b/src/commands.ts
index 4b4fd3d..65ad4d3 100644
--- a/src/commands.ts
+++ b/src/commands.ts
@@ -143,7 +143,13 @@ export type SolracCommand =
   // - `/tasks run <name>` fires a task on demand (bypasses the schedule
   //   clock; honors enabled / one_off_consumed / max_cost_usd / queue-full).
   | { kind: "tasks_list" }
-  | { kind: "tasks_run"; name: string };
+  | { kind: "tasks_run"; name: string }
+  // Voice (ElevenLabs). `enabled === null` is the no-arg query form ("what
+  // is voice mode set to right now"); explicit true/false flips
+  // sessions.voice_replies. Same dispatch for Telegram + web — web operators
+  // can `/voice on` to get the word-limit nudge even though web has its
+  // own per-message speak button.
+  | { kind: "voice_set"; enabled: boolean | null };
 
 export type ParseCommandResult =
   | { kind: "run"; cmd: SolracCommand }
@@ -167,6 +173,7 @@ export const BOT_COMMAND_REGISTRY: ReadonlyArray<BotCommand> = [
   { command: "status", description: "Show session and spend snapshot" },
   { command: "context", description: "Show context window size in bytes + tokens" },
   { command: "tasks", description: "List scheduled tasks (or run <name>)" },
+  { command: "voice", description: "Toggle voice replies (on/off)" },
   { command: "help", description: "Show available commands" },
 ];
 
@@ -201,6 +208,7 @@ const KNOWN_COMMANDS = new Set([
   "context",
   "help",
   "tasks",
+  "voice",
 ]);
 
 const TIER_ARG_MAP: Record<string, TierArg> = {
@@ -317,6 +325,20 @@ export function parseCommand(text: string, deps: ParseCommandDeps): ParseCommand
     return { kind: "run", cmd: { kind: "unknown", raw: `${prefix}tasks ${argRaw}` } };
   }
 
+  if (name === "voice") {
+    // No-arg → query current state (rendered by the dispatcher). Operators
+    // commonly type `/voice` to remember which mode they're in.
+    if (argRaw === "") return { kind: "run", cmd: { kind: "voice_set", enabled: null } };
+    const lower = argRaw.toLowerCase();
+    if (lower === "on" || lower === "1" || lower === "true") {
+      return { kind: "run", cmd: { kind: "voice_set", enabled: true } };
+    }
+    if (lower === "off" || lower === "0" || lower === "false") {
+      return { kind: "run", cmd: { kind: "voice_set", enabled: false } };
+    }
+    return { kind: "run", cmd: { kind: "unknown", raw: `${prefix}voice ${argRaw}` } };
+  }
+
   // /compact — `all` is invalid (compacting both tiers in one command is two
   // real Claude calls and surprising). No-arg → reject for the same reason
   // as /context above (silent `primary` default would summarize an empty
@@ -455,6 +477,8 @@ export async function runCommand(
       return runTasksList(deps, msg, updateId);
     case "tasks_run":
       return runTasksRun(deps, msg, updateId, cmd.name);
+    case "voice_set":
+      return runVoiceSet(deps, msg, updateId, cmd.enabled);
   }
 }
 
@@ -1186,6 +1210,7 @@ const HELP_COMMANDS_MD = [
   "- **context** `@|!` — show context-window size in bytes + tokens for that tier.",
   "- **help** — this card.",
   "- **status** — show session and spend snapshot for this chat.",
+  "- **voice** `[on|off]` — toggle voice replies (Telegram audio note + word-limit prompt). No arg shows current state.",
   "",
   "**Customize**",
   "",
@@ -1955,6 +1980,35 @@ async function runTasksRun(
   writeSystemAudit(deps, msg, updateId, `tasks_run:${result.kind}`, status);
 }
 
+// ---------------------------------------------------------------------------
+// /voice on|off
+// ---------------------------------------------------------------------------
+
+async function runVoiceSet(
+  deps: RunCommandDeps,
+  msg: Message,
+  updateId: number,
+  enabled: boolean | null,
+): Promise<void> {
+  const chatId = msg.chat.id;
+  let reply: string;
+  let auditTag: string;
+  if (enabled === null) {
+    // Query form (`/voice` with no arg): report the stored flag without writing it.
+    const label = deps.db.getVoiceRepliesFlag(chatId) ? "on" : "off";
+    reply = `🔊 voice mode: <b>${label}</b> (use <code>/voice on</code> or <code>/voice off</code>)`;
+    auditTag = `voice_query:${label}`;
+  } else {
+    deps.db.setVoiceRepliesFlag(chatId, enabled);
+    reply = enabled
+      ? "🔊 voice mode <b>on</b> — replies will be terser; Telegram replies attach an audio note."
+      : "🔇 voice mode <b>off</b> — replies return to normal length, no audio attached.";
+    auditTag = `voice_set:${enabled ? "on" : "off"}`;
+  }
+  await sendOrLog(deps.tg, chatId, reply, "cmd.voice_reply_failed");
+  writeSystemAudit(deps, msg, updateId, auditTag, "ok");
+}
+
 function formatScheduleSpec(t: ScheduledTask): string {
   const s = t.spec;
   if (s.kind === "cron") {
diff --git a/src/config.ts b/src/config.ts
index 7bf8388..05ad3a9 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -48,6 +48,7 @@
 import { existsSync } from "node:fs";
 import { homedir } from "node:os";
 import { isAbsolute, resolve } from "node:path";
+import { log } from "./log.ts";
 
 type Transport = "poll" | "webhook";
 
@@ -201,6 +202,41 @@ export interface Config {
   readonly webPort: number;
   readonly webToken: string | null;
   readonly webChatId: number;
+  // Voice (ElevenLabs STT + TTS). Master switch is `voiceEnabled` — every
+  // call site short-circuits when false, so unset ELEVENLABS_* keys are
+  // tolerated. When true, `elevenlabsApiKey` and `elevenlabsVoiceId` MUST
+  // be set (boot validation enforces). Independent cost cap (per-chat +
+  // global, sliding 60-min) over `voice_events.cost_usd_estimate` is
+  // separate from the Anthropic `audit.cost_usd` cap.
+  readonly voiceEnabled: boolean;
+  readonly elevenlabsApiKey: string | null;
+  readonly elevenlabsVoiceId: string | null;
+  readonly elevenlabsTtsModel: string;
+  readonly elevenlabsSttModel: string;
+  // Hard wall — TTS requests over this length are refused with HTTP 413 +
+  // chat hint. The word-limit prompt nudge defends against this; the wall
+  // is the last line of defense.
+  readonly voiceTtsMaxChars: number;
+  // Soft target — injected into the LLM prompt when sessions.voice_replies
+  // = 1. Clamped to [30, 200] at boot; out-of-range values warn and clamp.
+  readonly voiceReplyWordsHint: number;
+  readonly voiceSttMaxBytes: number;
+  readonly voiceSttMaxSeconds: number;
+  readonly voiceHourlyCostCapUsd: number;
+  readonly voiceGlobalHourlyCostCapUsd: number;
+  // Output format used by the web `<audio>` element. MP3 plays everywhere
+  // (Chromium, Firefox, Safari, mobile).
+  readonly elevenlabsTtsFormatWeb: string;
+  // Output format used for Telegram `sendVoice`. Default opus_48000_64 —
+  // §17 probe (May 2026) confirmed ElevenLabs returns Ogg-containerized
+  // Opus (`OggS` magic, `content-type: audio/opus`) which sendVoice accepts.
+  // Fallback to mp3_44100_64 + sendAudio if a future ElevenLabs change
+  // flips to raw Opus; voice.ts branches on the env var.
+  readonly elevenlabsTtsFormatTg: string;
+  // Per-1k-char TTS price + per-hour STT price. Constants so pricing drift
+  // can be pinned to the operator's plan without code changes.
+  readonly elevenlabsTtsPriceUsdPer1k: number;
+  readonly elevenlabsSttPriceUsdPerHour: number;
 }
 
 function parseAllowlist(raw: string): number[] {
@@ -596,6 +632,95 @@ export function loadConfig(env: NodeJS.ProcessEnv = process.env): Config {
       ? env.SOLRAC_INTEGRATIONS_DIR.trim()
       : "./integrations";
 
+  // Voice (ElevenLabs STT + TTS). Master switch off by default — unset
+  // ELEVENLABS_* keys are tolerated when voiceEnabled=false. When true, the
+  // key + voice id are mandatory (mirrors REMOTE_ENABLED requiring API key).
+  const voiceEnabled = parseBoolean("VOICE_ENABLED", env.VOICE_ENABLED, false);
+  const elevenlabsApiKey =
+    env.ELEVENLABS_API_KEY && env.ELEVENLABS_API_KEY.trim() !== ""
+      ? env.ELEVENLABS_API_KEY.trim()
+      : null;
+  const elevenlabsVoiceId =
+    env.ELEVENLABS_VOICE_ID && env.ELEVENLABS_VOICE_ID.trim() !== ""
+      ? env.ELEVENLABS_VOICE_ID.trim()
+      : null;
+  if (voiceEnabled) {
+    if (!elevenlabsApiKey) {
+      throw new Error(
+        "ELEVENLABS_API_KEY is required when VOICE_ENABLED=true (get one from your ElevenLabs Profile + API Keys page)",
+      );
+    }
+    if (!elevenlabsVoiceId) {
+      throw new Error(
+        "ELEVENLABS_VOICE_ID is required when VOICE_ENABLED=true (find it on the voice's detail page in ElevenLabs VoiceLab)",
+      );
+    }
+  }
+  const elevenlabsTtsModel =
+    env.ELEVENLABS_TTS_MODEL && env.ELEVENLABS_TTS_MODEL.trim() !== ""
+      ? env.ELEVENLABS_TTS_MODEL.trim()
+      : "eleven_flash_v2_5";
+  const elevenlabsSttModel =
+    env.ELEVENLABS_STT_MODEL && env.ELEVENLABS_STT_MODEL.trim() !== ""
+      ? env.ELEVENLABS_STT_MODEL.trim()
+      : "scribe_v2";
+  const voiceTtsMaxChars = parsePositiveInt("VOICE_TTS_MAX_CHARS", env.VOICE_TTS_MAX_CHARS, 3000);
+  // Words hint clamps to [30, 200]. Below 30 the model can't say anything
+  // useful; above 200 defeats the "user is listening, be brief" purpose.
+  // Out-of-range = warn + clamp (operator's env honored as closely as
+  // possible) rather than throw, because voice mode is non-critical.
+  const voiceReplyWordsHintRaw = parsePositiveInt(
+    "VOICE_REPLY_WORDS_HINT",
+    env.VOICE_REPLY_WORDS_HINT,
+    60,
+  );
+  let voiceReplyWordsHint = voiceReplyWordsHintRaw;
+  if (voiceReplyWordsHintRaw < 30) {
+    voiceReplyWordsHint = 30;
+    log.warn("config.voice_reply_words_clamped", { from: voiceReplyWordsHintRaw, to: 30 });
+  } else if (voiceReplyWordsHintRaw > 200) {
+    voiceReplyWordsHint = 200;
+    log.warn("config.voice_reply_words_clamped", { from: voiceReplyWordsHintRaw, to: 200 });
+  }
+  const voiceSttMaxBytes = parsePositiveInt(
+    "VOICE_STT_MAX_BYTES",
+    env.VOICE_STT_MAX_BYTES,
+    2_097_152,
+  );
+  const voiceSttMaxSeconds = parsePositiveInt(
+    "VOICE_STT_MAX_SECONDS",
+    env.VOICE_STT_MAX_SECONDS,
+    60,
+  );
+  const voiceHourlyCostCapUsd = parsePositiveNumber(
+    "VOICE_HOURLY_COST_CAP_USD",
+    env.VOICE_HOURLY_COST_CAP_USD,
+    0.25,
+  );
+  const voiceGlobalHourlyCostCapUsd = parsePositiveNumber(
+    "VOICE_GLOBAL_HOURLY_COST_CAP_USD",
+    env.VOICE_GLOBAL_HOURLY_COST_CAP_USD,
+    1.0,
+  );
+  const elevenlabsTtsFormatWeb =
+    env.ELEVENLABS_TTS_OUTPUT_FORMAT_WEB && env.ELEVENLABS_TTS_OUTPUT_FORMAT_WEB.trim() !== ""
+      ? env.ELEVENLABS_TTS_OUTPUT_FORMAT_WEB.trim()
+      : "mp3_44100_64";
+  const elevenlabsTtsFormatTg =
+    env.ELEVENLABS_TTS_OUTPUT_FORMAT_TG && env.ELEVENLABS_TTS_OUTPUT_FORMAT_TG.trim() !== ""
+      ? env.ELEVENLABS_TTS_OUTPUT_FORMAT_TG.trim()
+      : "opus_48000_64";
+  const elevenlabsTtsPriceUsdPer1k = parsePositiveNumber(
+    "ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS",
+    env.ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS,
+    0.05,
+  );
+  const elevenlabsSttPriceUsdPerHour = parsePositiveNumber(
+    "ELEVENLABS_STT_PRICE_USD_PER_HOUR",
+    env.ELEVENLABS_STT_PRICE_USD_PER_HOUR,
+    0.22,
+  );
+
   return Object.freeze({
     anthropicApiKey: env.ANTHROPIC_API_KEY!,
     telegramBotToken: env.TELEGRAM_BOT_TOKEN!,
@@ -648,5 +773,20 @@ export function loadConfig(env: NodeJS.ProcessEnv = process.env): Config {
     webPort,
     webToken,
     webChatId,
+    voiceEnabled,
+    elevenlabsApiKey,
+    elevenlabsVoiceId,
+    elevenlabsTtsModel,
+    elevenlabsSttModel,
+    voiceTtsMaxChars,
+    voiceReplyWordsHint,
+    voiceSttMaxBytes,
+    voiceSttMaxSeconds,
+    voiceHourlyCostCapUsd,
+    voiceGlobalHourlyCostCapUsd,
+    elevenlabsTtsFormatWeb,
+    elevenlabsTtsFormatTg,
+    elevenlabsTtsPriceUsdPer1k,
+    elevenlabsSttPriceUsdPerHour,
   });
 }
diff --git a/src/db.ts b/src/db.ts
index 0f0764c..f06140e 100644
--- a/src/db.ts
+++ b/src/db.ts
@@ -144,6 +144,31 @@ CREATE TABLE IF NOT EXISTS scheduled_tasks (
   source_hash TEXT NOT NULL,
   updated_at INTEGER NOT NULL
 );
+
+-- Voice (ElevenLabs STT + TTS) per-event audit log. Separate from the
+-- audit table because one turn can produce multiple voice events (one STT
+-- input + one TTS output) and the two-writes-per-turn shape doesn't fit.
+-- audit_id is informational only -- no FK so a denied-gate STT (which
+-- never reaches the audit table) still has a row here. Cost-cap queries
+-- sum cost_usd_estimate over the sliding 60-min window per chat and
+-- globally, independent of the Anthropic audit.cost_usd cap.
+CREATE TABLE IF NOT EXISTS voice_events (
+  id INTEGER PRIMARY KEY AUTOINCREMENT,
+  chat_id INTEGER NOT NULL,
+  ts_ms INTEGER NOT NULL,
+  kind TEXT NOT NULL CHECK (kind IN ('stt','tts')),
+  source TEXT NOT NULL CHECK (source IN ('web','telegram')),
+  model TEXT NOT NULL,
+  voice_id TEXT,
+  audit_id INTEGER,
+  duration_ms INTEGER,
+  chars INTEGER,
+  cost_usd_estimate REAL NOT NULL DEFAULT 0,
+  status TEXT NOT NULL CHECK (status IN ('ok','denied_cap','denied_gate','error')),
+  error_message TEXT
+);
+CREATE INDEX IF NOT EXISTS idx_voice_events_chat_ts ON voice_events (chat_id, ts_ms);
+CREATE INDEX IF NOT EXISTS idx_voice_events_ts ON voice_events (ts_ms);
 `;
 
 export interface AuditInsert {
@@ -335,6 +360,19 @@ export interface SolracDb {
   }) => void;
   setTaskLastRunOnly: (name: string, lastRunAt: number) => void;
   sumTaskCostSince: (taskName: string, sinceMs: number) => number;
+  // Voice (ElevenLabs) — see `voice_events` table comment.
+  recordVoiceEvent: (row: VoiceEventInsert) => number;
+  // Per-chat sliding 60-min window over voice_events.cost_usd_estimate for
+  // the voice-axis cost cap. Independent of `audit.cost_usd` (Anthropic
+  // burn); voice spend and Claude spend cap separately.
+  voiceCostUsedLast60min: (chatId: number, sinceMs: number) => number;
+  voiceCostUsedGlobalLast60min: (sinceMs: number) => number;
+  // Per-chat sticky toggle backing `/voice on|off`. Returns false for unknown
+  // chats (no row yet — same as `voice_replies=0`).
+  getVoiceRepliesFlag: (chatId: number) => boolean;
+  // Upserts the sessions row; touches `updated_at` so a fresh chat that
+  // toggles voice before its first turn still has a created/updated stamp.
+  setVoiceRepliesFlag: (chatId: number, enabled: boolean) => void;
   close: () => void;
 }
 
@@ -349,6 +387,32 @@ export interface ScheduledTaskRow {
   updatedAt: number;
 }
 
+// Outcome of a voice event. `denied_cap`/`denied_gate` rows still persist so
+// the operator can see "we spent N tries denied at the gate" — audit-before-
+// acting parity with the main `audit` table.
+export type VoiceEventStatus = "ok" | "denied_cap" | "denied_gate" | "error";
+
+export interface VoiceEventInsert {
+  chatId: number;
+  tsMs: number;
+  kind: "stt" | "tts";
+  source: "web" | "telegram";
+  model: string;
+  voiceId: string | null;
+  // Informational link to `audit.id` when this voice event corresponds to a
+  // turn that touched the audit table. Null for STT events that happen
+  // before the turn is created and for denied-gate STT (no turn ever runs).
+  auditId: number | null;
+  // STT: audio duration in ms; TTS: null. Cost math uses `audio_duration_secs`
+  // from the upstream response, persisted here as ms for a single time unit.
+  durationMs: number | null;
+  // TTS: input character count fed to ElevenLabs; STT: null.
+  chars: number | null;
+  costUsdEstimate: number;
+  status: VoiceEventStatus;
+  errorMessage: string | null;
+}
+
 export async function openDb(dataDir: string): Promise<SolracDb> {
   await mkdir(dataDir, { recursive: true });
   const dbPath = join(dataDir, "solrac.sqlite");
@@ -468,6 +532,14 @@ export async function openDb(dataDir: string): Promise<SolracDb> {
     db.run("ALTER TABLE sessions ADD COLUMN local_cutoff_ms INTEGER");
     log.info("db.migrated", { migration: "sessions.local_cutoff_ms_added" });
   }
+  // Voice — per-chat sticky toggle for the post-turn TTS attach + the
+  // voice-mode word-limit prompt. 0 = off (default), 1 = on. Read by both
+  // SOLRAC.md injection sites every turn so toggling takes effect on the
+  // next message without a restart.
+  if (!sessionCols.some((c) => c.name === "voice_replies")) {
+    db.run("ALTER TABLE sessions ADD COLUMN voice_replies INTEGER NOT NULL DEFAULT 0");
+    log.info("db.migrated", { migration: "sessions.voice_replies_added" });
+  }
   // PLAN Step 12 — retag legacy `audit.model='claude'` rows. They ran on the
   // then-default SOLRAC_MODEL=claude-opus-4-7, which is now the secondary
   // tier. Predicate-idempotent: after first boot, no row matches.
@@ -663,6 +735,36 @@ export async function openDb(dataDir: string): Promise<SolracDb> {
     "SELECT COALESCE(SUM(cost_usd), 0) AS spent FROM audit " +
       "WHERE task_name = ? AND started_at >= ?",
   );
+  const stInsertVoiceEvent = db.prepare(
+    "INSERT INTO voice_events " +
+      "(chat_id, ts_ms, kind, source, model, voice_id, audit_id, duration_ms, chars, " +
+      "cost_usd_estimate, status, error_message) " +
+      "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+  );
+  // Cap windows match the Anthropic per-chat / global cap shape: the caller
+  // passes `Date.now() - 3600_000`; we sum `cost_usd_estimate` since then.
+  // Only `status='ok'` rows count — denied events don't bill, so counting
+  // them would let one denial keep triggering further denials.
+  const stSumVoiceCostChat = db.prepare(
+    "SELECT COALESCE(SUM(cost_usd_estimate), 0) AS spent FROM voice_events " +
+      "WHERE chat_id = ? AND ts_ms >= ? AND status = 'ok'",
+  );
+  const stSumVoiceCostGlobal = db.prepare(
+    "SELECT COALESCE(SUM(cost_usd_estimate), 0) AS spent FROM voice_events " +
+      "WHERE ts_ms >= ? AND status = 'ok'",
+  );
+  const stGetVoiceReplies = db.prepare(
+    "SELECT voice_replies FROM sessions WHERE chat_id = ?",
+  );
+  // INSERT OR IGNORE creates a baseline row if absent (default created_at +
+  // updated_at = now), then the UPDATE flips the flag. Two statements stay
+  // simpler than the equivalent UPSERT for a column not on the PK.
+  const stEnsureSession = db.prepare(
+    "INSERT OR IGNORE INTO sessions (chat_id, created_at, updated_at) VALUES (?, ?, ?)",
+  );
+  const stSetVoiceReplies = db.prepare(
+    "UPDATE sessions SET voice_replies = ?, updated_at = ? WHERE chat_id = ?",
+  );
 
   return {
     raw: db,
@@ -879,6 +981,40 @@ export async function openDb(dataDir: string): Promise<SolracDb> {
       const row = stSumTaskCostSince.get(taskName, sinceMs) as { spent: number | null } | null;
       return row?.spent ?? 0;
     },
+    recordVoiceEvent(row) {
+      const r = stInsertVoiceEvent.run(
+        row.chatId,
+        row.tsMs,
+        row.kind,
+        row.source,
+        row.model,
+        row.voiceId,
+        row.auditId,
+        row.durationMs,
+        row.chars,
+        row.costUsdEstimate,
+        row.status,
+        row.errorMessage,
+      );
+      return Number(r.lastInsertRowid);
+    },
+    voiceCostUsedLast60min(chatId, sinceMs) {
+      const row = stSumVoiceCostChat.get(chatId, sinceMs) as { spent: number | null } | null;
+      return row?.spent ?? 0;
+    },
+    voiceCostUsedGlobalLast60min(sinceMs) {
+      const row = stSumVoiceCostGlobal.get(sinceMs) as { spent: number | null } | null;
+      return row?.spent ?? 0;
+    },
+    getVoiceRepliesFlag(chatId) {
+      const row = stGetVoiceReplies.get(chatId) as { voice_replies: number } | null;
+      return row?.voice_replies === 1;
+    },
+    setVoiceRepliesFlag(chatId, enabled) {
+      const now = Date.now();
+      stEnsureSession.run(chatId, now, now);
+      stSetVoiceReplies.run(enabled ? 1 : 0, now, chatId);
+    },
     close() {
       db.close();
     },
diff --git a/src/elevenlabs.ts b/src/elevenlabs.ts
new file mode 100644
index 0000000..68143e4
--- /dev/null
+++ b/src/elevenlabs.ts
@@ -0,0 +1,175 @@
+/**
+ * @fileoverview Typed `fetch` wrapper for the ElevenLabs HTTP API.
+ * @purpose Two endpoints, one file. STT (`POST /v1/speech-to-text`, multipart)
+ *          and TTS streaming (`POST /v1/text-to-speech/{voice_id}/stream`,
+ *          JSON in / chunked body out). No SDK — preserves the "marked is the
+ *          only other runtime dep" anti-goal.
+ *
+ * Solrac sits behind ElevenLabs at conversational latency. STT is a single
+ * round-trip with an audio blob; TTS is proxy-streamed from upstream straight
+ * back to the caller (browser `<audio>` or buffered for Telegram `sendVoice`),
+ * so this module returns the upstream `ReadableStream` as-is rather than
+ * buffering server-side.
+ *
+ * §17 probe captured at impl time (2026-05): with
+ * `output_format=opus_48000_64`, ElevenLabs returns an Ogg-containerized
+ * Opus payload (magic bytes `OggS`, `content-type: audio/opus`). Telegram's
+ * `sendVoice` accepts this directly — no transcoding step needed. If a future
+ * ElevenLabs change flips this to raw Opus, the `voice.ts` probe falls back
+ * to `mp3_44100_64` + `sendAudio` via env var.
+ *
+ * Position in the dependency graph:
+ *   (no internal imports) → elevenlabs → consumed by voice
+ *
+ * Exports:
+ *   - `speechToText(opts)` — multipart upload, returns `{ text, durationSeconds }`.
+ *   - `textToSpeechStream(opts)` — JSON request, returns `{ contentType, body }`
+ *     where `body` is the upstream `ReadableStream<Uint8Array>` (proxy through).
+ *   - `ElevenLabsError` / `ElevenLabsAuthError` / `ElevenLabsRateError` —
+ *     typed errors; orchestration layer maps to user-facing outcomes.
+ *
+ * Key invariants:
+ *   - The API key is passed in via the call site (`opts.apiKey`) — this module
+ *     never reads `process.env`. Lets `voice.ts` own the "voice disabled"
+ *     branching without an env probe per call.
+ *   - Never logs the API key, the upload bytes, or the response audio. The
+ *     plain transcript text is fine to log (the operator owns the audit log)
+ *     but the wrapper itself does NOT log — callers do.
+ *   - `signal?: AbortSignal` is honored on both requests so an operator-side
+ *     timeout (or shutdown) can cut the upstream connection. Default timeout
+ *     wiring lives in the caller, not here.
+ *
+ * Gotchas:
+ *   - ElevenLabs returns HTTP 401 for missing/invalid keys, 429 for rate
+ *     limits, and 422 for malformed bodies. We surface 401 → `ElevenLabsAuthError`,
+ *     429 → `ElevenLabsRateError`, everything else 4xx/5xx → `ElevenLabsError`
+ *     with the upstream message in `.message` if JSON-decodable.
+ *   - The STT response shape is `{ text, audio_duration_secs, ... }` — we
+ *     consume only the two fields we bill on; future fields (language, words)
+ *     are ignored silently to avoid churn on minor upstream additions.
+ *   - For TTS we return the `ReadableStream` from `Response.body`. The caller
+ *     is responsible for consuming it (`Response(body)` pipe-through for web,
+ *     `arrayBuffer()` aggregation for Telegram). Leaking it = leaking a socket.
+ */
+
+const STT_URL = "https://api.elevenlabs.io/v1/speech-to-text";
+const TTS_BASE = "https://api.elevenlabs.io/v1/text-to-speech";
+
+export class ElevenLabsError extends Error {
+  readonly status: number;
+  constructor(message: string, status: number) {
+    super(message);
+    this.name = "ElevenLabsError";
+    this.status = status;
+  }
+}
+
+export class ElevenLabsAuthError extends ElevenLabsError {
+  constructor(message: string) {
+    super(message, 401);
+    this.name = "ElevenLabsAuthError";
+  }
+}
+
+export class ElevenLabsRateError extends ElevenLabsError {
+  constructor(message: string) {
+    super(message, 429);
+    this.name = "ElevenLabsRateError";
+  }
+}
+
+export interface SttResult {
+  text: string;
+  durationSeconds: number;
+}
+
+export interface SttRequestOpts {
+  apiKey: string;
+  modelId: string;
+  audio: Blob;
+  filename?: string;
+  signal?: AbortSignal;
+}
+
+export interface TtsRequestOpts {
+  apiKey: string;
+  voiceId: string;
+  modelId: string;
+  outputFormat: string;
+  text: string;
+  signal?: AbortSignal;
+}
+
+export interface TtsStream {
+  contentType: string;
+  body: ReadableStream<Uint8Array>;
+}
+
+// Best-effort upstream error text, capped at 200 chars; falls back to `HTTP <status>`
+// when the body is unreadable or blank. Never throws.
+async function decodeUpstreamError(res: Response): Promise<string> {
+  const text = await res.text().catch(() => "");
+  let message = text;
+  try {
+    const parsed = JSON.parse(text) as { detail?: { message?: string } | string } | null;
+    const detail = parsed?.detail;
+    if (typeof detail === "string") message = detail;
+    else if (typeof detail?.message === "string") message = detail.message;
+  } catch {
+    // Body is not JSON; keep the raw text.
+  }
+  return message.trim() === "" ? `HTTP ${res.status}` : message.slice(0, 200);
+}
+
+// Raises the typed error matching an upstream HTTP status; never returns.
+function throwForStatus(status: number, message: string): never {
+  const Typed = status === 401 ? ElevenLabsAuthError : status === 429 ? ElevenLabsRateError : null;
+  throw Typed ? new Typed(message) : new ElevenLabsError(message, status);
+}
+
+// Uploads `opts.audio` (multipart) and returns the transcript + upstream-reported duration.
+// Throws typed errors for non-2xx, and `ElevenLabsError` when a 2xx body is not valid JSON.
+export async function speechToText(opts: SttRequestOpts): Promise<SttResult> {
+  const form = new FormData();
+  form.append("model_id", opts.modelId);
+  form.append("file", opts.audio, opts.filename ?? "audio.webm");
+  const init = { method: "POST", headers: { "xi-api-key": opts.apiKey }, body: form };
+  const res = await fetch(STT_URL, { ...init, signal: opts.signal });
+  if (!res.ok) {
+    throwForStatus(res.status, `ElevenLabs STT failed: ${await decodeUpstreamError(res)}`);
+  }
+  const json = (await res.json().catch((err: unknown) => {
+    // Re-type only malformed JSON; aborts and network errors propagate untouched.
+    if (!(err instanceof SyntaxError)) throw err;
+    throw new ElevenLabsError("ElevenLabs STT returned invalid JSON", res.status);
+  })) as { text?: unknown; audio_duration_secs?: unknown } | null;
+  const secs = json?.audio_duration_secs;
+  return {
+    text: typeof json?.text === "string" ? json.text : "",
+    // A non-numeric or non-positive duration falls back to 0, same as a missing one.
+    durationSeconds: typeof secs === "number" && secs > 0 ? secs : 0,
+  };
+}
+
+// Requests streamed TTS audio for `opts.text`; resolves with the upstream body, unread.
+export async function textToSpeechStream(opts: TtsRequestOpts): Promise<TtsStream> {
+  const voice = encodeURIComponent(opts.voiceId);
+  const format = encodeURIComponent(opts.outputFormat);
+  const res = await fetch(`${TTS_BASE}/${voice}/stream?output_format=${format}`, {
+    method: "POST",
+    headers: { "xi-api-key": opts.apiKey, "content-type": "application/json", accept: "audio/*" },
+    body: JSON.stringify({ text: opts.text, model_id: opts.modelId }),
+    signal: opts.signal,
+  });
+  if (!res.ok) {
+    throwForStatus(res.status, `ElevenLabs TTS failed: ${await decodeUpstreamError(res)}`);
+  }
+  const { body } = res;
+  if (body === null) {
+    throw new ElevenLabsError("ElevenLabs TTS returned empty body", res.status);
+  }
+  return {
+    contentType: res.headers.get("content-type") ?? "application/octet-stream",
+    body,
+  };
+}
diff --git a/src/engine.ts b/src/engine.ts
index f15a689..05f8011 100644
--- a/src/engine.ts
+++ b/src/engine.ts
@@ -77,6 +77,7 @@ import type { SdkMcpToolDefinition } from "@anthropic-ai/claude-agent-sdk";
 import type { SolracDb } from "./db.ts";
 import type { SessionStore } from "./session.ts";
 import { readInstanceMd, wrapInstanceMd } from "./instance.ts";
+import { buildVoiceModePrompt } from "./voice.ts";
 import type { IntegrationTier } from "./integrations.ts";
 import {
   type EngineChatMessage,
@@ -144,6 +145,21 @@ export interface EngineRunDeps {
   // `LOCAL_MAX_TOOL_ITERATIONS` / `REMOTE_MAX_TOOL_ITERATIONS`. Defaults to
   // 8; only consulted when tools are enabled.
   maxToolIterations?: number;
+  // Voice — word target for the voice-mode prompt nudge. When set AND
+  // `sessions.voice_replies=1` for the chat, an extra `system` message
+  // carrying the `<voice-mode>` block is pushed right after the SOLRAC.md
+  // overlay. Optional — omitted when `VOICE_ENABLED=false`.
+  voiceReplyWords?: number;
+  // Post-turn TTS attach. Called once per successful turn AFTER the audit
+  // row is finalized. Wired only on the Telegram transport when VOICE_ENABLED
+  // — web has its own per-message speak button. Best-effort: failures inside
+  // the callback log + return; they do NOT propagate.
+  attachVoiceReply?: (opts: {
+    chatId: number;
+    messageId: number | null;
+    auditId: number;
+    finalText: string;
+  }) => Promise<void>;
 }
 
 export interface EngineRunInput {
@@ -221,6 +237,16 @@ export async function runEngineTurn(
   if (instanceMd !== null) {
     messages.push({ role: "system", content: wrapInstanceMd(instanceMd) });
   }
+  // Voice-mode block — pushed as its own `system` message right after the
+  // SOLRAC.md overlay (per plan §14). Local models without RLHF on
+  // instruction hierarchy do better with a distinct system turn than
+  // concatenation into another one.
+  if (deps.voiceReplyWords !== undefined && deps.db.getVoiceRepliesFlag(input.chatId)) {
+    messages.push({
+      role: "system",
+      content: buildVoiceModePrompt({ words: deps.voiceReplyWords }),
+    });
+  }
   // History reconstruction: stateful chat context per chat. Pulls every
   // successful turn for the chat regardless of engine — primary Claude,
   // secondary Claude, prior local. Each row's `model` field tags origin but
@@ -386,6 +412,15 @@ export async function runEngineTurn(
     costUsd: costForAudit,
     isError,
   });
+
+  if (deps.attachVoiceReply && !isError && assistantText.trim()) {
+    await deps.attachVoiceReply({
+      chatId: input.chatId,
+      messageId: stubId,
+      auditId,
+      finalText: assistantText,
+    });
+  }
 }
 
 /**
@@ -556,6 +591,15 @@ async function runEngineTurnWithTools(
   if (instanceMd !== null) {
     initialMessages.push({ role: "system", content: wrapInstanceMd(instanceMd) });
   }
+  // Voice-mode block — same as the single-shot path. Both modes must
+  // agree so toggling `/voice on` is consistent regardless of whether
+  // integrations are wired.
+  if (deps.voiceReplyWords !== undefined && deps.db.getVoiceRepliesFlag(input.chatId)) {
+    initialMessages.push({
+      role: "system",
+      content: buildVoiceModePrompt({ words: deps.voiceReplyWords }),
+    });
+  }
   // Same cutoff treatment as the single-shot path; the tool-loop variant
   // must agree so `/clear local` is consistent across both modes.
   const cutoff = deps.sessions?.getLocalCutoff(input.chatId) ?? 0;
@@ -697,6 +741,15 @@ async function runEngineTurnWithTools(
     iterationCapHit: result.iterationCapHit,
     isError,
   });
+
+  if (deps.attachVoiceReply && !isError && result.assistantText.trim()) {
+    await deps.attachVoiceReply({
+      chatId: input.chatId,
+      messageId: stubId,
+      auditId,
+      finalText: result.assistantText,
+    });
+  }
 }
 
 // Render variants for the tools-on path. Mirror the single-shot
diff --git a/src/main.ts b/src/main.ts
index b88d81e..c6a138b 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -118,7 +118,7 @@ import {
   type DenialThrottle,
   type GlobalCostCapGuard,
 } from "./policy.ts";
-import { createTurnQueue } from "./queue.ts";
+import { createTurnQueue, type EnqueueResult } from "./queue.ts";
 import {
   EMPTY_TASK_REGISTRY,
   getScheduledContext,
@@ -151,6 +151,12 @@ import { createTelegramClient, type TelegramClient } from "./telegram.ts";
 import { TurnTracker } from "./turn-tracker.ts";
 import { createWebClient, type WebClient } from "./web-client.ts";
 import { startWebServer } from "./web.ts";
+import {
+  handleTelegramVoiceStt,
+  maybeReplyWithVoice,
+  type VoiceDeps,
+  type VoiceTelegramSender,
+} from "./voice.ts";
 
 interface RunTurnDeps {
   tg: TelegramClient;
@@ -195,13 +201,102 @@ interface RunTurnDeps {
   // value created by `createSdkMcpServer` and threaded into `runAgent`'s
   // `options.mcpServers`. Claude tiers only — local path ignores this.
   mcpServer: McpSdkServerConfigWithInstance | null;
+  // Voice (ElevenLabs). `null` when VOICE_ENABLED=false. When non-null, the
+  // dispatcher routes `msg.voice` through `handleTelegramVoiceStt` to
+  // transcribe → enqueue the synthesized text Update. Wired only on the
+  // Telegram transport (web has its own /api/stt route).
+  voiceDeps: VoiceDeps | null;
+  // Forward reference to `queue.enqueue` so the voice dispatcher can
+  // re-enqueue a synthesized text Update once STT lands. Mutable closure
+  // bound at boot (queue is built after makeRunTurn — same pattern as
+  // `getQueueSnapshot` and `triggerScheduledTask`).
+  enqueue: (update: Update) => EnqueueResult;
+  // Phase 5 — post-turn TTS attach. Threaded into AgentRunDeps and the
+  // localDeps EngineRunDeps so Claude tiers + local + remote all attach a
+  // voice note when `voice_replies=1`. `undefined` on the web RunTurnDeps
+  // (web has its own per-message speak button).
+  attachVoiceReply?: AttachVoiceReply;
+  // Word target for the voice-mode prompt nudge (Phase 1). Threaded to
+  // both runners. `undefined` when VOICE_ENABLED=false; identical for
+  // Telegram + web (both transports inject the nudge when /voice on).
+  voiceReplyWords?: number;
 }
 
+type AttachVoiceReply = (opts: {
+  chatId: number;
+  messageId: number | null;
+  auditId: number;
+  finalText: string;
+}) => Promise<void>;
+
 function makeRunTurn(deps: RunTurnDeps): (update: Update) => Promise<void> {
   return async (update) => {
     const msg = update.message;
-    if (!msg || !msg.text || !msg.from) {
-      log.debug("turn.ignored", { update_id: update.update_id, kind: "non-text-or-no-from" });
+    if (!msg || !msg.from) {
+      log.debug("turn.ignored", { update_id: update.update_id, kind: "no-msg-or-no-from" });
+      return;
+    }
+    // Voice note → STT → re-enqueue as synthesized text. Gated on
+    // `voiceDeps` (null when VOICE_ENABLED=false). The synthesized Update
+    // carries the same update_id/from/chat but `text=transcript` and NO
+    // `voice` field, so this branch can't loop. Cap-check, gate, and
+    // voice_events row all happen inside `handleTelegramVoiceStt`.
+    if (!msg.text && msg.voice && deps.voiceDeps) {
+      const chatId = msg.chat.id;
+      log.info("turn.voice_received", {
+        update_id: update.update_id,
+        chat_id: chatId,
+        from_id: msg.from.id,
+        duration: msg.voice.duration,
+      });
+      await deps.tg
+        .sendChatAction(chatId, "record_voice")
+        .catch((err) =>
+          log.debug("voice.chat_action_failed", { error: (err as Error).message }),
+        );
+      const result = await handleTelegramVoiceStt(deps.voiceDeps, {
+        update,
+        voiceFileId: msg.voice.file_id,
+      });
+      if (result.kind === "synthesized") {
+        log.info("voice.stt_ok", {
+          update_id: update.update_id,
+          chat_id: chatId,
+          text_preview: result.update.message?.text?.slice(0, 80) ?? "",
+        });
+        const enq = deps.enqueue(result.update);
+        if (enq.kind === "dropped_queue_full") {
+          await deps.tg
+            .sendMessage(chatId, `queue full (${enq.depth} waiting) — try again in a moment`)
+            .catch((err) =>
+              log.warn("voice.queue_full_notice_failed", { error: (err as Error).message }),
+            );
+        }
+        return;
+      }
+      if (result.kind === "denied_cap") {
+        await deps.tg
+          .sendMessage(chatId, "voice cap reached, try again in a minute")
+          .catch((err) =>
+            log.warn("voice.cap_notice_failed", { error: (err as Error).message }),
+          );
+      } else if (result.kind === "error") {
+        log.warn("voice.stt_failed", {
+          update_id: update.update_id,
+          chat_id: chatId,
+          message: result.message,
+        });
+        await deps.tg
+          .sendMessage(chatId, "voice transcription failed — try sending as text")
+          .catch((err) =>
+            log.warn("voice.error_notice_failed", { error: (err as Error).message }),
+          );
+      }
+      // denied_gate is silent (parity with the text-gate path).
+      return;
+    }
+    if (!msg.text) {
+      log.debug("turn.ignored", { update_id: update.update_id, kind: "non-text-or-voice" });
       return;
     }
     // Scheduler — when this update was synthesized by the tick driver, the
@@ -337,6 +432,8 @@ function makeRunTurn(deps: RunTurnDeps): (update: Update) => Promise<void> {
         globalCostGuard: deps.globalCostGuard,
         createCanUseTool: deps.createCanUseTool,
         mcpServer: deps.mcpServer,
+        voiceReplyWords: deps.voiceReplyWords,
+        attachVoiceReply: deps.attachVoiceReply,
       },
       {
         chatId: msg.chat.id,
@@ -823,6 +920,8 @@ async function main(): Promise<void> {
             toolTiers: localToolsActive ? integrationToolTiers : undefined,
             broker: localToolsActive ? broker : undefined,
             maxToolIterations: localSlotMaxToolIterations,
+            voiceReplyWords: config.voiceEnabled ? config.voiceReplyWordsHint : undefined,
+            attachVoiceReply: undefined,
           }
         : null;
     if (localDeps && localDriver) {
@@ -974,6 +1073,67 @@ async function main(): Promise<void> {
         : null;
     }
 
+    // Voice (Phase 4 STT + Phase 5 TTS attach). Built once at boot when
+    // VOICE_ENABLED=true; null otherwise. Telegram dispatcher gets the
+    // populated value; web transport passes `null` because it has its own
+    // /api/stt and /api/tts routes (Phase 2).
+    //
+    // The `telegramSender` lets the post-turn hook attach a voice note to
+    // assistant replies on the Telegram transport. Phase 5's
+    // `maybeReplyWithVoice` no-ops when this is missing.
+    const telegramSender: VoiceTelegramSender | null = config.voiceEnabled
+      ? {
+          sendVoice: async (chatId, audio, opts) => {
+            await tg.sendVoice(chatId, audio, {
+              reply_to_message_id: opts?.replyToMessageId,
+              mime_type: opts?.mimeType,
+            });
+          },
+          sendAudio: async (chatId, audio, opts) => {
+            await tg.sendAudio(chatId, audio, {
+              reply_to_message_id: opts?.replyToMessageId,
+              mime_type: opts?.mimeType,
+            });
+          },
+        }
+      : null;
+    const voiceDeps: VoiceDeps | null = config.voiceEnabled
+      ? {
+          db,
+          tg,
+          config,
+          isAllowed: allowlist.isAllowed,
+          telegramSender: telegramSender ?? undefined,
+        }
+      : null;
+    // Bound once so AgentRunDeps + EngineRunDeps share the same callback
+    // (and the same VoiceDeps, including cost-cap state).
+    const attachVoiceReply: AttachVoiceReply | undefined = voiceDeps
+      ? (opts) => maybeReplyWithVoice(voiceDeps, opts)
+      : undefined;
+    // Splice the post-turn hook onto the Telegram-bound `localDeps` so the
+    // engine runner sees it when the operator types into a Telegram chat.
+    // `webLocalDeps` (built above as a spread of `localDeps`) is overridden
+    // explicitly below — web has its own per-message speak button.
+    if (localDeps) {
+      localDeps.attachVoiceReply = attachVoiceReply;
+    }
+    if (webLocalDeps) {
+      webLocalDeps.attachVoiceReply = undefined;
+    }
+    // Forward reference for the voice dispatcher's re-enqueue. Mirrors the
+    // `queueRef` pattern — queue is built after makeRunTurn, but the closure
+    // resolves the latest reference at call time.
+    const voiceEnqueue = (update: Update): EnqueueResult => {
+      if (!queueRef) {
+        // Defensive: voice dispatch fires inside a queue worker, so queue
+        // must exist. If it doesn't, drop loud rather than NPE.
+        log.warn("voice.enqueue_pre_queue", { update_id: update.update_id });
+        return { kind: "dropped_queue_full", depth: 0, key: 0 };
+      }
+      return queueRef.enqueue(update);
+    };
+
     const tgRunTurn = makeRunTurn({
       tg,
       db,
@@ -991,6 +1151,10 @@ async function main(): Promise<void> {
       botUsername,
       skillRegistry,
       mcpServer: integrationsMcpServer,
+      voiceDeps,
+      enqueue: voiceEnqueue,
+      voiceReplyWords: config.voiceEnabled ? config.voiceReplyWordsHint : undefined,
+      attachVoiceReply,
     });
     const webRunTurn = webClient
       ? makeRunTurn({
@@ -1008,6 +1172,10 @@ async function main(): Promise<void> {
           localDeps: webLocalDeps,
           commandDeps: webCommandDeps!,
           botUsername: null,
+          voiceDeps: null,
+          enqueue: voiceEnqueue,
+          voiceReplyWords: config.voiceEnabled ? config.voiceReplyWordsHint : undefined,
+          attachVoiceReply: undefined,
           skillRegistry,
           mcpServer: integrationsMcpServer,
         })
@@ -1086,6 +1254,11 @@ async function main(): Promise<void> {
           db
             .recentChatTurns(config.webChatId, 50)
             .filter((r) => !r.model.startsWith("system")),
+        // Voice (Phase 2). Same VoiceDeps as the Telegram path so the
+        // sliding 60-min cap is shared across transports — operator can't
+        // double up by talking on web + Telegram simultaneously.
+        voiceDeps,
+        voiceRepliesEnabled: () => db.getVoiceRepliesFlag(config.webChatId),
       });
     }
     // Scheduler tick loop — started AFTER queue construction (it depends on
diff --git a/src/telegram.ts b/src/telegram.ts
index 814c92d..216b2f1 100644
--- a/src/telegram.ts
+++ b/src/telegram.ts
@@ -26,7 +26,7 @@
  *   - `createTelegramClient(token)` — factory returning a `TelegramClient`.
  *   - `TelegramClient` — interface with generic `call`, plus typed helpers:
  *     `getUpdates`, `sendMessage`, `editMessageText`, `setMessageReaction`,
- *     `sendChatAction`.
+ *     `sendChatAction`, `sendVoice`, `sendAudio`.
  *   - `TelegramConflictError` — 409 variant; signals "exit 1, don't retry."
  *   - `TelegramApiError` — generic non-409, non-429 API failure.
  *   - `htmlEscapeText(s)` — escape `&`, `<`, `>` for HTML text-context interpolation.
@@ -134,6 +134,14 @@ export interface BotCommand {
   description: string;
 }
 
+export interface SendVoiceOpts {
+  reply_to_message_id?: number;
+  // Telegram infers the MIME from the multipart filename if absent; explicit
+  // override here is for callers (like voice.ts) that already know the
+  // upstream content-type. Both sendVoice and sendAudio accept this.
+  mime_type?: string;
+}
+
 export interface TelegramClient {
   call: <T>(
     method: string,
@@ -156,6 +164,27 @@ export interface TelegramClient {
   // Both are non-fatal at boot — wrapped in `.catch` at the call site.
   getMe: () => Promise<BotIdentity>;
   setMyCommands: (commands: ReadonlyArray<BotCommand>) => Promise<true>;
+  // Voice (Phase 5). Multipart POST to /sendVoice or /sendAudio.
+  //   sendVoice expects Ogg/Opus — renders as a chat voice pill with a
+  //     waveform preview. Per Telegram's docs the file must be encoded with
+  //     OPUS (other formats yield a generic "audio file" rendering).
+  //   sendAudio expects an MP3 (or other media-codec audio); renders as a
+  //     file attachment with play controls. Used as the fallback path when
+  //     ElevenLabs returns raw Opus (no OggS container).
+  // The `audio` field accepts an ArrayBuffer — the caller already aggregated
+  // the upstream stream. We wrap as a Blob with `mime_type` (default
+  // audio/ogg for voice, audio/mpeg for audio) so Telegram's content-type
+  // sniff matches.
+  sendVoice: (
+    chatId: number,
+    audio: ArrayBuffer,
+    opts?: SendVoiceOpts,
+  ) => Promise<Message>;
+  sendAudio: (
+    chatId: number,
+    audio: ArrayBuffer,
+    opts?: SendVoiceOpts,
+  ) => Promise<Message>;
 }
 
 export function htmlEscapeText(s: string): string {
@@ -203,6 +232,52 @@ export function createTelegramClient(token: string): TelegramClient {
     }
   }
 
+  async function callMultipart<T>(method: string, form: FormData): Promise<T> {
+    // Telegram's multipart endpoints share the same 409/429 semantics as the
+    // JSON ones, so the retry shape mirrors `call()`. The same FormData is
+    // re-sent on every attempt — nothing in the body varies per call (no
+    // hash/signature), so it is never rebuilt.
+    let attempt = 0;
+    while (true) {
+      attempt++;
+      const res = await fetch(`${base}/${method}`, {
+        method: "POST",
+        body: form,
+      });
+      const json = (await res.json()) as TelegramResponse<T>;
+      if (json.ok) return json.result as T;
+      const code = json.error_code ?? res.status;
+      const description = json.description ?? "unknown";
+      if (code === 409) throw new TelegramConflictError(description);
+      if (code === 429) {
+        const retryAfter = Number(json.parameters?.retry_after ?? 1);
+        log.warn("telegram.rate_limited", { method, retryAfter, attempt });
+        await Bun.sleep(Math.max(retryAfter * 1000, 100));
+        continue;
+      }
+      throw new TelegramApiError(code, description);
+    }
+  }
+
+  function buildAudioForm(
+    chatId: number,
+    audio: ArrayBuffer,
+    field: "voice" | "audio",
+    filename: string,
+    defaultMime: string,
+    opts: SendVoiceOpts | undefined,
+  ): FormData {
+    const mime = opts?.mime_type ?? defaultMime;
+    const blob = new Blob([audio], { type: mime });
+    const form = new FormData();
+    form.append("chat_id", String(chatId));
+    form.append(field, blob, filename);
+    if (opts?.reply_to_message_id !== undefined) {
+      form.append("reply_to_message_id", String(opts.reply_to_message_id));
+    }
+    return form;
+  }
+
   return {
     call,
     getUpdates(params, signal) {
@@ -239,5 +314,13 @@ export function createTelegramClient(token: string): TelegramClient {
     setMyCommands(commands) {
       return call<true>("setMyCommands", { commands });
     },
+    sendVoice(chatId, audio, opts) {
+      const form = buildAudioForm(chatId, audio, "voice", "voice.ogg", "audio/ogg", opts);
+      return callMultipart<Message>("sendVoice", form);
+    },
+    sendAudio(chatId, audio, opts) {
+      const form = buildAudioForm(chatId, audio, "audio", "audio.mp3", "audio/mpeg", opts);
+      return callMultipart<Message>("sendAudio", form);
+    },
   };
 }
diff --git a/src/voice.test.ts b/src/voice.test.ts
new file mode 100644
index 0000000..15de1f0
--- /dev/null
+++ b/src/voice.test.ts
@@ -0,0 +1,185 @@
+/**
+ * @fileoverview Unit tests for the pure-logic helpers in `voice.ts`.
+ * @proves Markdown → speech token-walk handles each construct correctly,
+ *         length-cap math is right, cost-estimate math matches the
+ *         documented rates, and the voice-mode prompt substitutes `words`
+ *         and `words * 3` correctly.
+ *
+ * Per CLAUDE.md "Testing Philosophy" — `bun:test` for pure logic only.
+ * Orchestration helpers (`handleWebStt` etc.) touch ElevenLabs HTTP +
+ * sqlite and are verified manually + smoke-flood.
+ */
+
+import { describe, expect, test } from "bun:test";
+import {
+  buildVoiceModePrompt,
+  estimateSttCostUsd,
+  estimateTtsCostUsd,
+  stripMarkdownForSpeech,
+} from "./voice.ts";
+
+describe("buildVoiceModePrompt", () => {
+  test("default 60 words → 'under 60' and 'up to 180'", () => {
+    const out = buildVoiceModePrompt({ words: 60 });
+    expect(out).toContain("under 60 words");
+    expect(out).toContain("up to 180 words");
+  });
+
+  test("expand budget is words × 3", () => {
+    expect(buildVoiceModePrompt({ words: 30 })).toContain("up to 90 words");
+    expect(buildVoiceModePrompt({ words: 100 })).toContain("up to 300 words");
+  });
+
+  test("wraps in <voice-mode> tags", () => {
+    const out = buildVoiceModePrompt({ words: 60 });
+    expect(out.startsWith("<voice-mode>")).toBe(true);
+    expect(out.endsWith("</voice-mode>")).toBe(true);
+  });
+
+  test("instructs against preamble and lists/code/tables", () => {
+    const out = buildVoiceModePrompt({ words: 60 });
+    expect(out).toMatch(/no preamble/i);
+    expect(out).toMatch(/prose over lists/i);
+  });
+});
+
+describe("stripMarkdownForSpeech", () => {
+  test("plain paragraph passes through", () => {
+    expect(stripMarkdownForSpeech("hello world")).toBe("hello world");
+  });
+
+  test("headers strip hash marks, keep text", () => {
+    expect(stripMarkdownForSpeech("# Big news")).toBe("Big news.");
+  });
+
+  test("bold and italic unwrap to inner text", () => {
+    expect(stripMarkdownForSpeech("this is **important** and *fine*")).toBe(
+      "this is important and fine",
+    );
+  });
+
+  test("fenced code blocks summarize", () => {
+    const md = "Here is code:\n\n```js\nconst x = 1;\n```\n\nDone.";
+    const out = stripMarkdownForSpeech(md);
+    expect(out).toContain("[code block omitted]");
+    expect(out).not.toContain("const x = 1");
+  });
+
+  test("inline code keeps inner text", () => {
+    expect(stripMarkdownForSpeech("call `foo()` first")).toBe("call foo() first");
+  });
+
+  test("tables summarize", () => {
+    const md = "before\n\n| a | b |\n|---|---|\n| 1 | 2 |\n\nafter";
+    const out = stripMarkdownForSpeech(md);
+    expect(out).toContain("[table omitted]");
+    expect(out).not.toContain("| a | b |");
+  });
+
+  test("lists flatten with comma separators", () => {
+    const md = "Items:\n- apples\n- oranges\n- pears";
+    const out = stripMarkdownForSpeech(md);
+    expect(out).toContain("apples, oranges, pears");
+  });
+
+  test("link text kept, URL dropped", () => {
+    expect(stripMarkdownForSpeech("see [the docs](https://example.com)")).toBe(
+      "see the docs",
+    );
+  });
+
+  test("blockquote prefixed with 'Quote:'", () => {
+    expect(stripMarkdownForSpeech("> remember this")).toContain("Quote: remember this");
+  });
+
+  test("hr renders as a sentence break", () => {
+    const out = stripMarkdownForSpeech("before\n\n---\n\nafter");
+    expect(out).toContain("before");
+    expect(out).toContain("after");
+  });
+
+  test("strikethrough text is kept (TTS can't whisper a strike)", () => {
+    expect(stripMarkdownForSpeech("this is ~~wrong~~ actually right")).toBe(
+      "this is wrong actually right",
+    );
+  });
+
+  test("whitespace collapses", () => {
+    expect(stripMarkdownForSpeech("hello   \n\n  world  ")).toBe("hello world");
+  });
+
+  test("empty input returns empty string", () => {
+    expect(stripMarkdownForSpeech("")).toBe("");
+  });
+
+  test("strips Claude tier agent footer", () => {
+    const md = "Here's the answer.\n\n*✅ 2 turns · $0.0123*";
+    const out = stripMarkdownForSpeech(md);
+    expect(out).toBe("Here's the answer.");
+    expect(out).not.toContain("✅");
+    expect(out).not.toContain("turns");
+    expect(out).not.toContain("$");
+  });
+
+  test("strips engine-slot footer with model and tools", () => {
+    const md =
+      "London is in BST right now, 19:42.\n\n" +
+      "*✅ remote:openrouter:z-ai/glm-5.1 · 1 tools · 6.6s · $0.0048*";
+    const out = stripMarkdownForSpeech(md);
+    expect(out).toContain("London");
+    expect(out).not.toContain("openrouter");
+    expect(out).not.toContain("$");
+    expect(out).not.toContain("tools");
+  });
+
+  test("strips footer even with surrounding whitespace", () => {
+    const md = "Answer.\n\n  *  ✅ 1 turn · $0.0001  *  \n\n";
+    const out = stripMarkdownForSpeech(md);
+    expect(out).toBe("Answer.");
+  });
+
+  test("plain ✅ in content is preserved (no italic markers)", () => {
+    // The footer specifically uses italic markers (*...*). A bare ✅
+    // inside content shouldn't be stripped.
+    const md = "Test passed ✅ — moving on.";
+    const out = stripMarkdownForSpeech(md);
+    expect(out).toContain("Test passed");
+    expect(out).toContain("moving on");
+  });
+});
+
+describe("estimateSttCostUsd", () => {
+  test("1 hour at $0.22/hr = $0.22", () => {
+    expect(estimateSttCostUsd(3600, 0.22)).toBeCloseTo(0.22, 6);
+  });
+
+  test("30 seconds at $0.22/hr ≈ $0.00183", () => {
+    expect(estimateSttCostUsd(30, 0.22)).toBeCloseTo(0.001833, 5);
+  });
+
+  test("0 seconds is free", () => {
+    expect(estimateSttCostUsd(0, 0.22)).toBe(0);
+  });
+
+  test("custom rate respected", () => {
+    expect(estimateSttCostUsd(3600, 1.0)).toBe(1.0);
+  });
+});
+
+describe("estimateTtsCostUsd", () => {
+  test("1000 chars at $0.05/1k = $0.05", () => {
+    expect(estimateTtsCostUsd(1000, 0.05)).toBeCloseTo(0.05, 6);
+  });
+
+  test("3000 chars at $0.05/1k = $0.15", () => {
+    expect(estimateTtsCostUsd(3000, 0.05)).toBeCloseTo(0.15, 6);
+  });
+
+  test("0 chars is free", () => {
+    expect(estimateTtsCostUsd(0, 0.05)).toBe(0);
+  });
+
+  test("multi v2 price ($0.10/1k) doubles the spend", () => {
+    expect(estimateTtsCostUsd(1000, 0.1)).toBeCloseTo(0.1, 6);
+  });
+});
diff --git a/src/voice.ts b/src/voice.ts
new file mode 100644
index 0000000..7848347
--- /dev/null
+++ b/src/voice.ts
@@ -0,0 +1,873 @@
+/**
+ * @fileoverview Voice (ElevenLabs STT + TTS) orchestration.
+ * @purpose Sits between transport surfaces (web routes, Telegram dispatcher,
+ *          agent post-turn hook) and the typed `elevenlabs.ts` HTTP wrapper.
+ *          Owns gate + cap-check + audit-write + transport delivery for every
+ *          voice event.
+ *
+ * Audit-before-acting parity: every entrypoint writes a `voice_events` row
+ * regardless of outcome — allowed, denied at the gate, capped, or upstream
+ * error. This mirrors the `audit` table's posture for Anthropic turns: the
+ * voice log is the source of truth for "did we attempt N requests this hour?"
+ *
+ * Cost cap is per-axis. Anthropic burn (`audit.cost_usd`) and ElevenLabs
+ * burn (`voice_events.cost_usd_estimate`) ride independent sliding-60-min
+ * windows; one cap firing doesn't gate the other. Order of checks (per
+ * PLAN.md §12.2):
+ *   - STT: gate → global voice cap → per-chat voice cap → upstream → write
+ *   - TTS: per-chat voice cap → global voice cap → length wall → upstream → write
+ * STT puts global before per-chat for the same reason the Anthropic hook
+ * does (a host-wide hit shouldn't be masked); TTS checks per-chat first.
+ *
+ * Position in the dependency graph:
+ *   elevenlabs + db + log + config + policy + telegram + markdown → voice →
+ *     consumed by web, main, agent, commands
+ *
+ * Exports:
+ *   - `buildVoiceModePrompt(opts)` — voice-mode system block injected by both
+ *     SOLRAC.md sites (`agent.ts::buildAugmentedPrompt`, engine.ts
+ *     wrapInstanceMd seams) when `sessions.voice_replies=1`.
+ *   - `stripMarkdownForSpeech(md)` — token-walk transform; tables/code → "[…]";
+ *     lists flattened; links unwrapped; whitespace collapsed.
+ *   - `estimateSttCostUsd` / `estimateTtsCostUsd` — cost math.
+ *   - `handleWebStt` / `handleWebTts` — web `/api/stt` + `/api/tts` entrypoints.
+ *   - `handleTelegramVoiceStt` — Telegram dispatcher branch for `msg.voice`.
+ *   - `maybeReplyWithVoice` — post-turn hook (agent.ts/engine.ts) that
+ *     attaches a voice note when `voice_replies=1` and transport is Telegram.
+ *
+ * Key invariants:
+ *   - Entrypoints write a `voice_events` row for every outcome except the early
+ *     "voice disabled" exit. Denied rows have `cost_usd_estimate=0` (cap-neutral).
+ *   - The `<voice-mode>` block is plumbing, NOT user content. Audit rows
+ *     persist the original user prompt; the prompt-augmentation layer
+ *     (buildAugmentedPrompt / engine system messages) injects the block.
+ *   - `stripMarkdownForSpeech` is pure: `marked.lexer` walks tokens; runtime
+ *     state is per-call. Safe to call from any context.
+ *   - TTS length is checked AFTER markdown strip so a long markdown reply
+ *     that's mostly code fences (replaced with "[code block omitted]")
+ *     doesn't refuse on the raw markdown length.
+ *
+ * Gotchas:
+ *   - `maybeReplyWithVoice` is best-effort. Failures log + return; they do
+ *     NOT propagate to the caller (the original turn must succeed even if
+ *     TTS fails — voice is a UX layer, not the conversation).
+ *   - `handleTelegramVoiceStt` synthesizes a new Update with `text=transcript`
+ *     and NO `voice` field. The dispatcher loop must not re-process it as a
+ *     voice message (the field strip is what prevents loops).
+ *   - `handleWebTts` returns the upstream `ReadableStream` for proxy-through.
+ *     The caller (`web.ts`) is responsible for not buffering server-side —
+ *     that's the whole point of `/v1/text-to-speech/{id}/stream`.
+ */
+
+import { Marked } from "marked";
+import type { Token, Tokens } from "marked";
+import type { Update } from "@grammyjs/types";
+import type { Config } from "./config.ts";
+import type { SolracDb } from "./db.ts";
+import { log as defaultLog } from "./log.ts";
+import {
+  ElevenLabsAuthError,
+  ElevenLabsError,
+  ElevenLabsRateError,
+  speechToText,
+  textToSpeechStream,
+} from "./elevenlabs.ts";
+import { gateUpdate } from "./policy.ts";
+import type { TelegramClient } from "./telegram.ts";
+
+// One hour sliding window — matches `HOURLY_COST_CAP_USD` for Anthropic.
+const COST_WINDOW_MS = 60 * 60 * 1000;
+
+// Reuse a single Marked instance for `lexer()` — token shapes are identical
+// across calls, and the lexer carries no per-call state.
+const speechMarked = new Marked({ gfm: true, breaks: false });
+
+// ---------------------------------------------------------------------------
+// Pure helpers (unit-tested)
+// ---------------------------------------------------------------------------
+
+/**
+ * Returns the voice-mode system-prompt block. Injected by both SOLRAC.md
+ * sites when `sessions.voice_replies=1`. `words` comes from
+ * `config.voiceReplyWordsHint` (clamped to [30, 200] at boot).
+ *
+ * The "expand 3×" carve-out gives the model headroom when the operator
+ * explicitly asks for more — without it, "explain in detail" would be
+ * curtailed by the soft target. The hard wall (`VOICE_TTS_MAX_CHARS`) is
+ * the last line of defense if the model ignores both.
+ */
+export function buildVoiceModePrompt(opts: { words: number }): string {
+  const w = opts.words;
+  const expanded = w * 3;
+  return [
+    "<voice-mode>",
+    `The user is listening to your reply as audio. Respond in under ${w} words.`,
+    "No preamble — answer directly. Prefer prose over lists, code, or tables.",
+    "If the user asks you to expand, elaborate, or give more detail, you may",
+    `use up to ${expanded} words.`,
+    "</voice-mode>",
+  ].join("\n");
+}
+
+/**
+ * Token-walk transform: markdown → plain text suitable for TTS. Code
+ * fences and tables are summarized rather than read literally — neither
+ * speaks well, and reading code character-by-character is an antifeature.
+ * Returns trimmed, single-spaced text.
+ */
+export function stripMarkdownForSpeech(markdown: string): string {
+  const trimmed = stripAgentFooter(markdown);
+  let tokens: Token[];
+  try {
+    tokens = speechMarked.lexer(trimmed);
+  } catch {
+    // Parser glitch — fall back to the raw text so we never refuse to
+    // speak. The downstream length wall still gates absurdly long inputs.
+    return collapseWhitespace(trimmed);
+  }
+  const out = walkTokens(tokens);
+  return collapseWhitespace(out);
+}
+
+// Footer pattern from agent.ts::buildFooter / engine.ts::renderFinal:
+//   *✅ <metadata>*   — italicized line with the ✅ prefix carrying turn
+//                       count / cost / engine info. Pure UI chrome that
+//                       should never be spoken aloud. Only an italic span
+//                       that opens with ✅ (U+2705) is treated as the
+//                       footer — a bare ✅ in content is kept; strip from
+//                       the opening `*` to the closing `*` on the same line.
+const FOOTER_RE = /\*\s*✅[^*\n]*\*/g;
+function stripAgentFooter(markdown: string): string {
+  return markdown.replace(FOOTER_RE, "");
+}
+
+function walkTokens(tokens: Token[]): string {
+  const parts: string[] = [];
+  for (const t of tokens) {
+    parts.push(renderToken(t));
+  }
+  return parts.join(" ");
+}
+
+function renderToken(t: Token): string {
+  switch (t.type) {
+    case "space":
+      return " ";
+    case "heading": {
+      return walkTokens((t as Tokens.Heading).tokens as Token[]) + ". ";
+    }
+    case "paragraph":
+      return walkTokens((t as Tokens.Paragraph).tokens as Token[]) + " ";
+    case "text": {
+      const tt = t as Tokens.Text;
+      if (tt.tokens) return walkTokens(tt.tokens as Token[]);
+      return tt.text;
+    }
+    case "strong":
+      return walkTokens((t as Tokens.Strong).tokens as Token[]);
+    case "em":
+      return walkTokens((t as Tokens.Em).tokens as Token[]);
+    case "del":
+      return walkTokens((t as Tokens.Del).tokens as Token[]);
+    case "link":
+      return walkTokens((t as Tokens.Link).tokens as Token[]);
+    case "image":
+      return (t as Tokens.Image).text ?? "";
+    case "codespan":
+      return (t as Tokens.Codespan).text;
+    case "code":
+      return "[code block omitted]";
+    case "table":
+      return "[table omitted]";
+    case "blockquote":
+      return "Quote: " + walkTokens((t as Tokens.Blockquote).tokens as Token[]) + " ";
+    case "list": {
+      const list = t as Tokens.List;
+      const items = list.items.map((it) =>
+        walkTokens(it.tokens as Token[]).trim(),
+      );
+      return items.join(", ") + ". ";
+    }
+    case "list_item":
+      return walkTokens((t as Tokens.ListItem).tokens as Token[]);
+    case "hr":
+      return ". ";
+    case "br":
+      return " ";
+    case "html":
+      // Strip tags; HTML rarely appears in our outputs but a stray <br>
+      // shouldn't read aloud as "less-than br greater-than."
+      return (t as Tokens.HTML).text.replace(/<[^>]+>/g, " ");
+    case "escape":
+      return (t as Tokens.Escape).text;
+    default:
+      return "";
+  }
+}
+
+function collapseWhitespace(s: string): string {
+  return s.replace(/\s+/g, " ").trim();
+}
+
+/**
+ * STT cost estimate. ElevenLabs Scribe is billed per hour of audio at
+ * $0.22/hr (May 2026). Operator can override the price via
+ * `ELEVENLABS_STT_PRICE_USD_PER_HOUR` if their plan differs.
+ */
+export function estimateSttCostUsd(durationSeconds: number, pricePerHour: number): number {
+  return (durationSeconds / 3600) * pricePerHour;
+}
+
+/**
+ * TTS cost estimate. Flash v2.5 is $0.05/1k chars; bump to $0.10 for Multi
+ * v2. Operator pins via `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS`.
+ */
+export function estimateTtsCostUsd(chars: number, pricePer1k: number): number {
+  return (chars / 1000) * pricePer1k;
+}
+
+// ---------------------------------------------------------------------------
+// Orchestration entrypoints
+// ---------------------------------------------------------------------------
+
+export interface VoiceTelegramSender {
+  // Phase 5 wires these to typed `telegram.ts` helpers backed by multipart
+  // POSTs to sendVoice / sendAudio. Kept as a small injected interface so
+  // Phase 1 voice.ts has no compile-time dependency on the Telegram bot
+  // token (which lives in main.ts).
+  sendVoice: (
+    chatId: number,
+    audio: ArrayBuffer,
+    opts?: { replyToMessageId?: number; mimeType?: string },
+  ) => Promise<void>;
+  sendAudio: (
+    chatId: number,
+    audio: ArrayBuffer,
+    opts?: { replyToMessageId?: number; mimeType?: string },
+  ) => Promise<void>;
+}
+
+export interface VoiceDeps {
+  db: SolracDb;
+  tg: TelegramClient;
+  config: Config;
+  log?: typeof defaultLog;
+  // Phase 5 — when present, `maybeReplyWithVoice` uses it to send audio.
+  // When omitted (Phase 1 + web transport), the post-turn TTS attach is
+  // a no-op (web has its own per-message speak button).
+  telegramSender?: VoiceTelegramSender;
+  // Phase 4 — when present, `handleTelegramVoiceStt` uses it to gate before
+  // paying for Scribe. When omitted, gate is skipped (test scaffolding).
+  isAllowed?: (userId: number) => boolean;
+}
+
+export type WebSttResult =
+  | { kind: "ok"; text: string }
+  | { kind: "denied_cap" }
+  | { kind: "error"; message: string };
+
+export type WebTtsResult =
+  | { kind: "stream"; contentType: string; body: ReadableStream<Uint8Array> }
+  | { kind: "too_long"; maxChars: number }
+  | { kind: "denied_cap" }
+  | { kind: "error"; message: string };
+
+export type TelegramSttResult =
+  | { kind: "synthesized"; update: Update }
+  | { kind: "denied_gate" }
+  | { kind: "denied_cap" }
+  | { kind: "error"; message: string };
+
+/**
+ * Web `/api/stt` entrypoint. Cap → ElevenLabs STT → voice_events row.
+ * No gate here — the web route's session-cookie auth is the gate (web is
+ * single-user; the cookie proves operator identity).
+ */
+export async function handleWebStt(
+  deps: VoiceDeps,
+  opts: { chatId: number; audio: Blob; signal?: AbortSignal },
+): Promise<WebSttResult> {
+  const log = deps.log ?? defaultLog;
+  const cfg = deps.config;
+  if (!cfg.voiceEnabled || cfg.elevenlabsApiKey === null) {
+    return { kind: "error", message: "voice disabled" };
+  }
+  const now = Date.now();
+  const sinceMs = now - COST_WINDOW_MS;
+  const globalUsed = deps.db.voiceCostUsedGlobalLast60min(sinceMs);
+  if (globalUsed >= cfg.voiceGlobalHourlyCostCapUsd) {
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "web",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+      auditId: null,
+      durationMs: null,
+      chars: null,
+      costUsdEstimate: 0,
+      status: "denied_cap",
+      errorMessage: "global_voice_cap",
+    });
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: opts.chatId,
+      kind: "stt",
+      scope: "global",
+      window_cost_usd: globalUsed,
+    });
+    return { kind: "denied_cap" };
+  }
+  const chatUsed = deps.db.voiceCostUsedLast60min(opts.chatId, sinceMs);
+  if (chatUsed >= cfg.voiceHourlyCostCapUsd) {
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "web",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+      auditId: null,
+      durationMs: null,
+      chars: null,
+      costUsdEstimate: 0,
+      status: "denied_cap",
+      errorMessage: "chat_voice_cap",
+    });
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: opts.chatId,
+      kind: "stt",
+      scope: "chat",
+      window_cost_usd: chatUsed,
+    });
+    return { kind: "denied_cap" };
+  }
+  try {
+    const result = await speechToText({
+      apiKey: cfg.elevenlabsApiKey,
+      modelId: cfg.elevenlabsSttModel,
+      audio: opts.audio,
+      signal: opts.signal,
+    });
+    const cost = estimateSttCostUsd(result.durationSeconds, cfg.elevenlabsSttPriceUsdPerHour);
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "web",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+      auditId: null,
+      durationMs: Math.round(result.durationSeconds * 1000),
+      chars: null,
+      costUsdEstimate: cost,
+      status: "ok",
+      errorMessage: null,
+    });
+    return { kind: "ok", text: result.text };
+  } catch (err) {
+    return handleElevenLabsError(deps, {
+      err,
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "web",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+    });
+  }
+}
+
+/**
+ * Web `/api/tts` entrypoint. Cap → length wall → ElevenLabs TTS stream →
+ * voice_events row. Returns the upstream `ReadableStream` for proxy-through;
+ * the caller wraps in `new Response(body, …)`.
+ */
+export async function handleWebTts(
+  deps: VoiceDeps,
+  opts: { chatId: number; markdown: string; auditId: number | null; signal?: AbortSignal },
+): Promise<WebTtsResult> {
+  const log = deps.log ?? defaultLog;
+  const cfg = deps.config;
+  if (!cfg.voiceEnabled || cfg.elevenlabsApiKey === null || cfg.elevenlabsVoiceId === null) {
+    return { kind: "error", message: "voice disabled" };
+  }
+  const speech = stripMarkdownForSpeech(opts.markdown);
+  const now = Date.now();
+  const sinceMs = now - COST_WINDOW_MS;
+  const chatUsed = deps.db.voiceCostUsedLast60min(opts.chatId, sinceMs);
+  if (chatUsed >= cfg.voiceHourlyCostCapUsd) {
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "tts",
+      source: "web",
+      model: cfg.elevenlabsTtsModel,
+      voiceId: cfg.elevenlabsVoiceId,
+      auditId: opts.auditId,
+      durationMs: null,
+      chars: speech.length,
+      costUsdEstimate: 0,
+      status: "denied_cap",
+      errorMessage: "chat_voice_cap",
+    });
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: opts.chatId,
+      kind: "tts",
+      scope: "chat",
+      window_cost_usd: chatUsed,
+    });
+    return { kind: "denied_cap" };
+  }
+  const globalUsed = deps.db.voiceCostUsedGlobalLast60min(sinceMs);
+  if (globalUsed >= cfg.voiceGlobalHourlyCostCapUsd) {
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "tts",
+      source: "web",
+      model: cfg.elevenlabsTtsModel,
+      voiceId: cfg.elevenlabsVoiceId,
+      auditId: opts.auditId,
+      durationMs: null,
+      chars: speech.length,
+      costUsdEstimate: 0,
+      status: "denied_cap",
+      errorMessage: "global_voice_cap",
+    });
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: opts.chatId,
+      kind: "tts",
+      scope: "global",
+      window_cost_usd: globalUsed,
+    });
+    return { kind: "denied_cap" };
+  }
+  if (speech.length > cfg.voiceTtsMaxChars) {
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "tts",
+      source: "web",
+      model: cfg.elevenlabsTtsModel,
+      voiceId: cfg.elevenlabsVoiceId,
+      auditId: opts.auditId,
+      durationMs: null,
+      chars: speech.length,
+      costUsdEstimate: 0,
+      status: "error",
+      errorMessage: "too_long",
+    });
+    return { kind: "too_long", maxChars: cfg.voiceTtsMaxChars };
+  }
+  try {
+    const stream = await textToSpeechStream({
+      apiKey: cfg.elevenlabsApiKey,
+      voiceId: cfg.elevenlabsVoiceId,
+      modelId: cfg.elevenlabsTtsModel,
+      outputFormat: cfg.elevenlabsTtsFormatWeb,
+      text: speech,
+      signal: opts.signal,
+    });
+    const cost = estimateTtsCostUsd(speech.length, cfg.elevenlabsTtsPriceUsdPer1k);
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "tts",
+      source: "web",
+      model: cfg.elevenlabsTtsModel,
+      voiceId: cfg.elevenlabsVoiceId,
+      auditId: opts.auditId,
+      durationMs: null,
+      chars: speech.length,
+      costUsdEstimate: cost,
+      status: "ok",
+      errorMessage: null,
+    });
+    return { kind: "stream", contentType: stream.contentType, body: stream.body };
+  } catch (err) {
+    return handleElevenLabsError(deps, {
+      err,
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "tts",
+      source: "web",
+      model: cfg.elevenlabsTtsModel,
+      voiceId: cfg.elevenlabsVoiceId,
+      chars: speech.length,
+      auditId: opts.auditId,
+    });
+  }
+}
+
+/**
+ * Telegram dispatcher branch for `msg.voice`. Gate → cap → getFile →
+ * buffered download → Scribe → synthesize a new text Update. The caller
+ * (main.ts) feeds the synthesized Update back into the queue, where it
+ * follows the normal text-message path.
+ */
+export async function handleTelegramVoiceStt(
+  deps: VoiceDeps,
+  opts: { update: Update; voiceFileId: string },
+): Promise<TelegramSttResult> {
+  const log = deps.log ?? defaultLog;
+  const cfg = deps.config;
+  const update = opts.update;
+  const msg = update.message;
+  if (!msg) return { kind: "error", message: "no message" };
+  const chatId = msg.chat.id;
+  const now = Date.now();
+  if (!cfg.voiceEnabled || cfg.elevenlabsApiKey === null) {
+    return { kind: "error", message: "voice disabled" };
+  }
+  // Gate first — never pay Scribe for a non-allowlisted sender.
+  if (deps.isAllowed) {
+    const gate = gateUpdate(update, deps.isAllowed);
+    if (gate.kind !== "ok") {
+      deps.db.recordVoiceEvent({
+        chatId,
+        tsMs: now,
+        kind: "stt",
+        source: "telegram",
+        model: cfg.elevenlabsSttModel,
+        voiceId: null,
+        auditId: null,
+        durationMs: null,
+        chars: null,
+        costUsdEstimate: 0,
+        status: "denied_gate",
+        errorMessage: gate.kind,
+      });
+      return { kind: "denied_gate" };
+    }
+  }
+  const sinceMs = now - COST_WINDOW_MS;
+  const globalUsed = deps.db.voiceCostUsedGlobalLast60min(sinceMs);
+  if (globalUsed >= cfg.voiceGlobalHourlyCostCapUsd) {
+    deps.db.recordVoiceEvent({
+      chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "telegram",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+      auditId: null,
+      durationMs: null,
+      chars: null,
+      costUsdEstimate: 0,
+      status: "denied_cap",
+      errorMessage: "global_voice_cap",
+    });
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: chatId,
+      kind: "stt",
+      scope: "global",
+      window_cost_usd: globalUsed,
+    });
+    return { kind: "denied_cap" };
+  }
+  const chatUsed = deps.db.voiceCostUsedLast60min(chatId, sinceMs);
+  if (chatUsed >= cfg.voiceHourlyCostCapUsd) {
+    deps.db.recordVoiceEvent({
+      chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "telegram",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+      auditId: null,
+      durationMs: null,
+      chars: null,
+      costUsdEstimate: 0,
+      status: "denied_cap",
+      errorMessage: "chat_voice_cap",
+    });
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: chatId,
+      kind: "stt",
+      scope: "chat",
+      window_cost_usd: chatUsed,
+    });
+    return { kind: "denied_cap" };
+  }
+  try {
+    // `getFile` resolves the file_path; the actual download URL is
+    // `https://api.telegram.org/file/bot<TOKEN>/<file_path>`. Never log
+    // that URL — it contains the bot token.
+    const fileInfo = await deps.tg.call<{ file_path?: string }>("getFile", {
+      file_id: opts.voiceFileId,
+    });
+    if (!fileInfo.file_path) {
+      throw new Error("telegram getFile returned no file_path");
+    }
+    const downloadUrl = `https://api.telegram.org/file/bot${cfg.telegramBotToken}/${fileInfo.file_path}`;
+    const dlRes = await fetch(downloadUrl);
+    if (!dlRes.ok) {
+      throw new Error(`telegram file download failed: HTTP ${dlRes.status}`);
+    }
+    const audioBuf = await dlRes.arrayBuffer();
+    if (audioBuf.byteLength > cfg.voiceSttMaxBytes) {
+      throw new Error(
+        `telegram voice exceeds VOICE_STT_MAX_BYTES (${audioBuf.byteLength} > ${cfg.voiceSttMaxBytes})`,
+      );
+    }
+    const audioBlob = new Blob([audioBuf], { type: "audio/ogg" });
+    const result = await speechToText({
+      apiKey: cfg.elevenlabsApiKey,
+      modelId: cfg.elevenlabsSttModel,
+      audio: audioBlob,
+      filename: "voice.ogg",
+    });
+    const cost = estimateSttCostUsd(result.durationSeconds, cfg.elevenlabsSttPriceUsdPerHour);
+    deps.db.recordVoiceEvent({
+      chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "telegram",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+      auditId: null,
+      durationMs: Math.round(result.durationSeconds * 1000),
+      chars: null,
+      costUsdEstimate: cost,
+      status: "ok",
+      errorMessage: null,
+    });
+    const synthesized = synthesizeTextUpdate(update, result.text);
+    return { kind: "synthesized", update: synthesized };
+  } catch (err) {
+    return handleElevenLabsError(deps, {
+      err,
+      chatId,
+      tsMs: now,
+      kind: "stt",
+      source: "telegram",
+      model: cfg.elevenlabsSttModel,
+      voiceId: null,
+    });
+  }
+}
+
+/**
+ * Post-turn TTS attach for Telegram: synthesizes `finalText` through
+ * ElevenLabs and sends the audio to the chat via `deps.telegramSender`.
+ *
+ * Silently no-ops when voice is disabled or unconfigured, when the deploy
+ * has no `telegramSender`, when the chat's voice-replies flag is off, or
+ * when nothing speakable remains after markdown stripping. A tripped
+ * per-chat or global hourly cost cap records a `denied_cap` event and
+ * returns; a reply longer than `voiceTtsMaxChars` records a `too_long`
+ * error event and sends a short text notice instead of audio.
+ *
+ * Synthesis and send failures are best-effort: they are caught, recorded
+ * and logged rather than rethrown — the assistant's text reply must
+ * already have been sent.
+ *
+ * @param deps - Voice dependencies (config, db, Telegram clients, optional logger).
+ * @param opts - Target chat, optional message to reply to, audit id to store
+ *   on recorded events, and the reply text to speak.
+ */
+export async function maybeReplyWithVoice(
+  deps: VoiceDeps,
+  opts: {
+    chatId: number;
+    messageId: number | null;
+    auditId: number | null;
+    finalText: string;
+  },
+): Promise<void> {
+  const log = deps.log ?? defaultLog;
+  const cfg = deps.config;
+  if (!cfg.voiceEnabled || cfg.elevenlabsApiKey === null || cfg.elevenlabsVoiceId === null) {
+    return;
+  }
+  if (!deps.telegramSender) return;
+  if (!deps.db.getVoiceRepliesFlag(opts.chatId)) return;
+  const speech = stripMarkdownForSpeech(opts.finalText);
+  if (speech.length === 0) return;
+  const now = Date.now();
+
+  // Caught values are `unknown`: only read `.message` off real Errors so a
+  // non-Error rejection cannot yield `undefined` or throw inside a handler.
+  const errorText = (err: unknown): string =>
+    err instanceof Error ? err.message : String(err);
+
+  // Every event written by this function shares the same identity fields
+  // and differs only in status, error label and estimated cost.
+  const recordTts = (
+    status: "ok" | "error" | "denied_cap",
+    errorMessage: string | null,
+    costUsdEstimate = 0,
+  ): void => {
+    deps.db.recordVoiceEvent({
+      chatId: opts.chatId,
+      tsMs: now,
+      kind: "tts",
+      source: "telegram",
+      model: cfg.elevenlabsTtsModel,
+      voiceId: cfg.elevenlabsVoiceId,
+      auditId: opts.auditId,
+      durationMs: null,
+      chars: speech.length,
+      costUsdEstimate,
+      status,
+      errorMessage,
+    });
+  };
+
+  const sinceMs = now - COST_WINDOW_MS;
+  const chatUsed = deps.db.voiceCostUsedLast60min(opts.chatId, sinceMs);
+  if (chatUsed >= cfg.voiceHourlyCostCapUsd) {
+    recordTts("denied_cap", "chat_voice_cap");
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: opts.chatId,
+      kind: "tts",
+      scope: "chat",
+      window_cost_usd: chatUsed,
+    });
+    return;
+  }
+  const globalUsed = deps.db.voiceCostUsedGlobalLast60min(sinceMs);
+  if (globalUsed >= cfg.voiceGlobalHourlyCostCapUsd) {
+    recordTts("denied_cap", "global_voice_cap");
+    log.warn("voice.cost_cap_exceeded", {
+      chat_id: opts.chatId,
+      kind: "tts",
+      scope: "global",
+      window_cost_usd: globalUsed,
+    });
+    return;
+  }
+  if (speech.length > cfg.voiceTtsMaxChars) {
+    recordTts("error", "too_long");
+    // The notice itself is best-effort; a failed send is only logged.
+    await deps.tg
+      .sendMessage(opts.chatId, "reply too long to speak — try a shorter prompt")
+      .catch((err: unknown) =>
+        log.warn("voice.too_long_notice_failed", { error: errorText(err) }),
+      );
+    return;
+  }
+  try {
+    const stream = await textToSpeechStream({
+      apiKey: cfg.elevenlabsApiKey,
+      voiceId: cfg.elevenlabsVoiceId,
+      modelId: cfg.elevenlabsTtsModel,
+      outputFormat: cfg.elevenlabsTtsFormatTg,
+      text: speech,
+    });
+    const buf = await new Response(stream.body).arrayBuffer();
+    // Record the spend as soon as the audio is in hand, before the Telegram
+    // send, so a failed delivery still counts against the cost window.
+    const cost = estimateTtsCostUsd(speech.length, cfg.elevenlabsTtsPriceUsdPer1k);
+    recordTts("ok", null, cost);
+    // Opus format implies sendVoice; anything else (mp3 fallback) uses
+    // sendAudio. The env-var-driven format pick at boot decides which.
+    const useVoiceNote = cfg.elevenlabsTtsFormatTg.startsWith("opus_");
+    const mimeType = stream.contentType;
+    const replyOpts = opts.messageId
+      ? { replyToMessageId: opts.messageId, mimeType }
+      : { mimeType };
+    if (useVoiceNote) {
+      await deps.telegramSender.sendVoice(opts.chatId, buf, replyOpts);
+    } else {
+      await deps.telegramSender.sendAudio(opts.chatId, buf, replyOpts);
+    }
+  } catch (err) {
+    const message = errorText(err);
+    recordTts("error", message);
+    log.warn("voice.tts_failed", { chat_id: opts.chatId, error: message });
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Internals
+// ---------------------------------------------------------------------------
+
+/**
+ * Everything `handleElevenLabsError` needs to record and log one failed
+ * voice request.
+ */
+interface ErrorContext {
+  /** The caught value; may be anything, hence `unknown`. */
+  err: unknown;
+  /** Chat the failed request belongs to. */
+  chatId: number;
+  /** Timestamp (ms) stored on the recorded voice event. */
+  tsMs: number;
+  /** Which direction failed: speech-to-text or text-to-speech. */
+  kind: "stt" | "tts";
+  /** Transport the request came in on. */
+  source: "web" | "telegram";
+  /** Model id stored on the recorded event. */
+  model: string;
+  /** Voice id stored on the recorded event; `null` when there is none. */
+  voiceId: string | null;
+  /** Character count for the event; recorded as `null` when omitted. */
+  chars?: number | null;
+  /** Audit id for the event; recorded as `null` when omitted. */
+  auditId?: number | null;
+}
+
+/**
+ * Shared failure path for voice requests: persists an `error` voice event,
+ * emits one log line whose name and level depend on the error class, and
+ * returns the `{ kind: "error" }` result for the caller to hand back.
+ */
+function handleElevenLabsError(
+  deps: VoiceDeps,
+  ctx: ErrorContext,
+): { kind: "error"; message: string } {
+  const { err, chatId, kind, source } = ctx;
+  const logger = deps.log ?? defaultLog;
+
+  // Non-Error throwables get a fixed placeholder rather than being stringified.
+  let message = "elevenlabs request failed";
+  if (err instanceof Error) message = err.message;
+
+  deps.db.recordVoiceEvent({
+    chatId,
+    tsMs: ctx.tsMs,
+    kind,
+    source,
+    model: ctx.model,
+    voiceId: ctx.voiceId,
+    auditId: ctx.auditId ?? null,
+    durationMs: null,
+    chars: ctx.chars ?? null,
+    costUsdEstimate: 0,
+    status: "error",
+    errorMessage: message,
+  });
+
+  // Auth and rate errors are tested before the generic ElevenLabsError
+  // (presumably their base class — confirm), so they keep their own labels.
+  const fields = { chat_id: chatId, kind, source };
+  if (err instanceof ElevenLabsAuthError) {
+    logger.error("voice.auth_failed", fields);
+  } else if (err instanceof ElevenLabsRateError) {
+    logger.warn("voice.rate_limited", fields);
+  } else if (err instanceof ElevenLabsError) {
+    logger.warn("voice.upstream_error", { ...fields, status: err.status, message });
+  } else {
+    logger.warn("voice.failed", { ...fields, message });
+  }
+  return { kind: "error", message };
+}
+
+/**
+ * Construct a synthesized text Update from a voice Update. Carries the
+ * same update_id / chat / from so the downstream dispatcher treats it
+ * identically to a typed message. The `voice` field is intentionally
+ * dropped — the dispatcher must not re-route it as a voice message.
+ *
+ * @param original - The incoming update; must carry a `message`.
+ * @param transcript - Transcribed speech, used as the message `text`.
+ * @returns A new Update containing only the copied fields plus `text`.
+ * @throws Error when `original` has no `message` to copy from.
+ */
+function synthesizeTextUpdate(original: Update, transcript: string): Update {
+  const msg = original.message;
+  // Explicit guard instead of a non-null assertion: a message-less update
+  // fails with a descriptive error rather than an opaque TypeError.
+  if (!msg) {
+    throw new Error("synthesizeTextUpdate: update has no message");
+  }
+  return {
+    update_id: original.update_id,
+    message: {
+      message_id: msg.message_id,
+      date: msg.date,
+      chat: msg.chat,
+      from: msg.from,
+      text: transcript,
+    },
+  } as Update;
+}
diff --git a/src/web-client.ts b/src/web-client.ts
index 1c105d9..676ea1f 100644
--- a/src/web-client.ts
+++ b/src/web-client.ts
@@ -181,5 +181,16 @@ export function createWebClient(opts: WebClientOpts = {}): WebClient {
     async setMyCommands() {
       return true as const;
     },
+    /** Unsupported on the web transport — always rejects. */
+    async sendVoice() {
+      // The web transport has its own per-message speak button (handled
+      // by the browser); the post-turn TTS attach (`voice.maybeReplyWithVoice`)
+      // is gated on the presence of a `telegramSender` dep, which the web
+      // path never wires. So sendVoice/sendAudio should never be called on
+      // a WebClient — if they are, throw loudly rather than silently drop.
+      throw new Error("WebClient.sendVoice is not implemented — web uses per-message speak button");
+    },
+    /** Unsupported on the web transport — always rejects. */
+    async sendAudio() {
+      // Audio playback on the web goes through the browser's per-message
+      // speak button, so a call here is treated as an error, not dropped.
+      throw new Error("WebClient.sendAudio is not implemented — web uses per-message speak button");
+    },
   };
 }
diff --git a/src/web.ts b/src/web.ts
index d34bed9..12c3ffa 100644
--- a/src/web.ts
+++ b/src/web.ts
@@ -35,6 +35,7 @@ import { randomUUID } from "node:crypto";
 import { log } from "./log.ts";
 import { sanitizeHtml } from "./web-sanitize.ts";
 import type { WebClient, WebBusEvent } from "./web-client.ts";
+import { handleWebStt, handleWebTts, type VoiceDeps } from "./voice.ts";
 
 // PNX-168 — text imports of UI assets so the compiled binary serves the web
 // UI without runtime fs reads of the source tree. In dev these resolve to the
@@ -96,6 +97,19 @@ export interface WebServerDeps {
    * runs `marked` + sanitizer.
    */
   loadHistory: () => Array<{ prompt: string; response: string; model: string }>;
+  /**
+   * Voice (Phase 2). When set, the server exposes /api/stt + /api/tts
+   * routes backed by ElevenLabs. `null` when VOICE_ENABLED=false — the
+   * routes still respond with HTTP 503 so the browser can render a clean
+   * "voice disabled" toast rather than a 404 mystery.
+   */
+  voiceDeps: VoiceDeps | null;
+  /**
+   * Voice-replies current state for this chat (the web one shares a single
+   * synthetic chat id). The browser badges the composer with "voice mode
+   * on" when true. main.ts wires this to `db.getVoiceRepliesFlag(webChatId)`.
+   */
+  voiceRepliesEnabled: () => boolean;
 }
 
 export interface WebServerHandle {
@@ -180,6 +194,97 @@ export function startWebServer(deps: WebServerDeps): WebServerHandle {
     return Response.json({ ok: true, turns: deps.loadHistory() });
   }
 
+  /**
+   * GET /api/voice/state — reports whether voice replies are currently on,
+   * as given by `deps.voiceRepliesEnabled()`. Responds 401 when the request
+   * is not authorized.
+   */
+  function voiceStateHandler(req: Request): Response {
+    if (authorize(req)) {
+      const enabled = deps.voiceRepliesEnabled();
+      return Response.json({ ok: true, enabled });
+    }
+    return Response.json({ ok: false }, { status: 401 });
+  }
+
+  /**
+   * POST /api/stt — transcribes an uploaded clip (multipart field `audio`)
+   * through `handleWebStt` for the shared web chat id.
+   *
+   * Responses: 401 unauthorized; 503 voice disabled; 413 audio larger than
+   * `voiceSttMaxBytes`; 400 unparsable multipart or missing `audio` part;
+   * 429 cost cap reached; 502 any other voice failure; otherwise
+   * `{ ok: true, text }`.
+   */
+  async function sttHandler(req: Request): Promise<Response> {
+    if (!authorize(req)) return Response.json({ ok: false }, { status: 401 });
+    if (!deps.voiceDeps) {
+      return Response.json({ ok: false, message: "voice disabled" }, { status: 503 });
+    }
+    const cap = deps.voiceDeps.config.voiceSttMaxBytes;
+    // `req.formData()` buffers the entire body. The Content-Length check
+    // is a fast-path reject for oversized uploads before we pay the buffer.
+    // Content-Length covers the whole multipart envelope (boundaries and
+    // part headers), not just the audio, so allow some slack: otherwise a
+    // clip just under the cap is refused here even though it would pass the
+    // exact `audio.size` check below.
+    const multipartOverheadBytes = 16 * 1024;
+    const lenHeader = req.headers.get("content-length");
+    const len = lenHeader ? Number(lenHeader) : NaN;
+    if (Number.isFinite(len) && len > cap + multipartOverheadBytes) {
+      return Response.json(
+        { ok: false, message: `audio exceeds ${cap} bytes` },
+        { status: 413 },
+      );
+    }
+    let form: FormData;
+    try {
+      form = await req.formData();
+    } catch (err) {
+      // Caught values are `unknown`; only real Errors carry `.message`.
+      const reason = err instanceof Error ? err.message : String(err);
+      log.warn("web.stt_form_parse_failed", { error: reason });
+      return Response.json({ ok: false, message: "invalid multipart" }, { status: 400 });
+    }
+    const audio = form.get("audio");
+    if (!(audio instanceof Blob)) {
+      return Response.json({ ok: false, message: "missing audio" }, { status: 400 });
+    }
+    // Authoritative size check on the audio part itself.
+    if (audio.size > cap) {
+      return Response.json(
+        { ok: false, message: `audio exceeds ${cap} bytes` },
+        { status: 413 },
+      );
+    }
+    const result = await handleWebStt(deps.voiceDeps, {
+      chatId: deps.webChatId,
+      audio,
+    });
+    if (result.kind === "ok") return Response.json({ ok: true, text: result.text });
+    if (result.kind === "denied_cap") {
+      return Response.json({ ok: false, message: "voice cap reached" }, { status: 429 });
+    }
+    return Response.json({ ok: false, message: result.message }, { status: 502 });
+  }
+
+  /**
+   * POST /api/tts — speaks the `markdown` field of a JSON body through
+   * `handleWebTts` and streams the audio back. A numeric `message_id` is
+   * forwarded as the audit id; anything else becomes `null`.
+   *
+   * Responses: 401 unauthorized; 503 voice disabled; 400 blank or missing
+   * markdown (including an unparsable body); 413 text too long; 429 cost
+   * cap reached; 502 any other voice failure; otherwise the audio stream.
+   */
+  async function ttsHandler(req: Request): Promise<Response> {
+    if (!authorize(req)) return Response.json({ ok: false }, { status: 401 });
+    if (!deps.voiceDeps) {
+      return Response.json({ ok: false, message: "voice disabled" }, { status: 503 });
+    }
+
+    // A body that fails to parse is treated the same as an empty one.
+    let payload: { message_id?: unknown; markdown?: unknown } | null;
+    try {
+      payload = (await req.json()) as { message_id?: unknown; markdown?: unknown } | null;
+    } catch {
+      payload = null;
+    }
+
+    let markdown = "";
+    let auditId: number | null = null;
+    if (payload) {
+      if (typeof payload.markdown === "string") markdown = payload.markdown;
+      if (typeof payload.message_id === "number") auditId = payload.message_id;
+    }
+    if (markdown.trim().length === 0) {
+      return Response.json({ ok: false, message: "empty markdown" }, { status: 400 });
+    }
+
+    const result = await handleWebTts(deps.voiceDeps, {
+      chatId: deps.webChatId,
+      markdown,
+      auditId,
+    });
+    switch (result.kind) {
+      case "stream":
+        return new Response(result.body, {
+          headers: {
+            "content-type": result.contentType,
+            "cache-control": "no-cache, no-transform",
+          },
+        });
+      case "too_long":
+        return Response.json(
+          {
+            ok: false,
+            message: `reply too long to speak (max ${result.maxChars} chars)`,
+          },
+          { status: 413 },
+        );
+      case "denied_cap":
+        return Response.json({ ok: false, message: "voice cap reached" }, { status: 429 });
+      default:
+        return Response.json({ ok: false, message: result.message }, { status: 502 });
+    }
+  }
+
   function streamHandler(req: Request): Response {
     if (!authorize(req)) return new Response("unauthorized", { status: 401 });
     const encoder = new TextEncoder();
@@ -283,6 +388,9 @@ export function startWebServer(deps: WebServerDeps): WebServerHandle {
         if (req.method === "POST" && path === "/api/confirm") return await confirmHandler(req);
         if (req.method === "GET" && path === "/api/stream") return streamHandler(req);
         if (req.method === "GET" && path === "/api/history") return historyHandler(req);
+        if (req.method === "GET" && path === "/api/voice/state") return voiceStateHandler(req);
+        if (req.method === "POST" && path === "/api/stt") return await sttHandler(req);
+        if (req.method === "POST" && path === "/api/tts") return await ttsHandler(req);
         return new Response("not found", { status: 404 });
       } catch (err) {
         log.error("web.handler_failed", {