diff --git a/.env.example b/.env.example
index 6780d0f..50e226d 100644
--- a/.env.example
+++ b/.env.example
@@ -143,3 +143,70 @@ SOLRAC_WEB_TOKEN=
# Synthetic chat id all web traffic shares. Negative integer; default −1000
# avoids collision with real Telegram chat ids.
# SOLRAC_WEB_CHAT_ID=-1000
+
+# ── Voice (ElevenLabs STT + TTS) ────────────────────────────────────────────
+# Off by default. When VOICE_ENABLED=true:
+# - Web UI gets a mic button (record → STT pre-fills the composer) and a
+# per-message speak button (TTS plays in the browser).
+# - Telegram voice notes (msg.voice) are transcribed and fed through the
+# normal text path.
+# - `/voice on` (per chat, sticky) injects a word-limit prompt to the LLM
+# AND attaches a Telegram voice note to each reply.
+#
+# REQUIRED when VOICE_ENABLED=true (boot fails loud otherwise):
+# ELEVENLABS_API_KEY — from https://elevenlabs.io → Profile + API Keys
+# (starts `sk_…`). Restrict the key to Text-to-Speech
+# + Speech-to-Text endpoints only.
+# ELEVENLABS_VOICE_ID — 20-char id from VoiceLab → voice detail page.
+#
+# Both ELEVENLABS_* and VOICE_* keys are scrubbed from the Claude SDK
+# subprocess env (agent.ts::sanitizedSubprocessEnv) so a compromised model
+# can't exfiltrate them via an auto-allowed Bash command.
+#
+# Privacy: audio + transcripts + TTS-bound replies hit ElevenLabs SaaS.
+# SOUL.md / SOLRAC.md never leave the host, but the speech does.
+#
+# VOICE_ENABLED=true
+# ELEVENLABS_API_KEY=sk_REPLACE_ME
+# ELEVENLABS_VOICE_ID=REPLACE_ME
+
+# TTS model. eleven_flash_v2_5 is $0.05/1k chars, low latency. Other options:
+# eleven_turbo_v2_5 (similar price/latency), eleven_multilingual_v2 ($0.10/1k,
+# better quality). Pin this to match your ElevenLabs plan or preference.
+# ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
+
+# STT model. scribe_v2 went GA March 2026 and replaces v1.
+# ELEVENLABS_STT_MODEL=scribe_v2
+
+# Hard wall — TTS requests over this length are refused with HTTP 413 (web)
+# or a user-visible chat hint (Telegram). The voice-mode prompt nudge (see
+# below) defends against this softly; the wall is the last line of defense.
+# VOICE_TTS_MAX_CHARS=3000
+
+# Soft target — when `/voice on` is set for a chat, this many words is the
+# budget injected into the LLM prompt ("respond in under N words"). The
+# model may use up to 3× when the user explicitly asks for more.
+# Clamps to [30, 200] at boot (out-of-range warns + clamps).
+# VOICE_REPLY_WORDS_HINT=60
+
+# STT upload limits. Web `/api/stt` rejects oversized bodies before paying
+# Scribe; Telegram voice-note download is bounded by the same byte cap.
+# VOICE_STT_MAX_BYTES=2097152 # 2 MiB
+# VOICE_STT_MAX_SECONDS=60 # client MediaRecorder stops at this
+
+# Independent voice cost cap (Anthropic burn is separate). Sliding 60-min
+# windows over voice_events.cost_usd_estimate.
+# VOICE_HOURLY_COST_CAP_USD=0.25 # per-chat
+# VOICE_GLOBAL_HOURLY_COST_CAP_USD=1.00 # host-wide
+
+# Output formats. Web uses MP3 (plays everywhere). Telegram uses Ogg/Opus
+# (sendVoice). §17 probe (May 2026) confirmed ElevenLabs returns
+# Ogg-containerized Opus for opus_48000_64; if a future change flips that
+# to raw Opus, set this to mp3_44100_64 and Telegram path uses sendAudio.
+# ELEVENLABS_TTS_OUTPUT_FORMAT_WEB=mp3_44100_64
+# ELEVENLABS_TTS_OUTPUT_FORMAT_TG=opus_48000_64
+
+# Pricing constants used for cap math + voice_events.cost_usd_estimate.
+# Pin to your ElevenLabs plan if it differs from the published defaults.
+# ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS=0.05
+# ELEVENLABS_STT_PRICE_USD_PER_HOUR=0.22
diff --git a/README.md b/README.md
index 24d8db6..ce2bf8b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Solrac
-> A self-hosted, hackable personal Agent: free local LLM (Ollama or LMStudio) or remote LLM (OpenRouter) by default, with explicit escalation to Anthropic's Claude Sonnet/Opus via the Claude Agent SDK. Reach it from Telegram or a browser; own every audit row, permission rule, and budget cap.
+> A self-hosted personal Agent you can configure, hack, and **converse with**. Reach it by text from Telegram or a browser, or by voice (ElevenLabs STT + TTS) on either transport. Free local LLM (Ollama / LMStudio) or remote (OpenRouter) by default; escalate to Anthropic's Claude Sonnet (`@`) or Opus (`!`) only when you mean it. Own every audit row, permission rule, and budget cap.
@@ -16,11 +16,12 @@ It's deliberately smaller and narrower than other personal-assistant projects:
Both are broader and better-resourced. **Solrac's distinct value:**
- **BYO-model engine slot.** No-prefix messages route to whichever model source you wire — free on-host (Ollama / LMStudio) or pay-per-token remote (OpenRouter). `@` (Sonnet) and `!` (Opus) are paid Claude escalations only on operator intent.
-- **Cost enforcement, not just visibility.** Sliding per-chat and global hourly USD caps that *deny* turns when hit — they sum every `cost_usd` row (Claude or OpenRouter), so remote-mode burn is gated by the same ceilings without extra configuration. Plus a daily cost-report DM.
-- **Audit-before-acting.** Every update (allowed, denied, queue-full) writes a row to one append-only SQLite table, tagged with the engine that served it (`local:ollama:...`, `remote:openrouter:...`, `claude:primary:...`).
+- **Cost enforcement, not just visibility.** Sliding per-chat and global hourly USD caps that *deny* turns when hit — they sum every `cost_usd` row (Claude or OpenRouter), so remote-mode burn is gated by the same ceilings without extra configuration. Plus a daily cost-report DM. Voice spend (ElevenLabs STT + TTS, when enabled) rides a **second** independent cost-cap axis with its own per-chat + global ceilings.
+- **Voice on every transport.** Telegram voice notes get transcribed; the web UI has a mic button and per-message speak buttons. `/voice on` turns on terse audio replies. ~120 lines of `fetch` against ElevenLabs — no SDK, no realtime WebSocket. Off by default.
+- **Audit-before-acting.** Every update (allowed, denied, queue-full) writes a row to one append-only SQLite table, tagged with the engine that served it (`local:ollama:...`, `remote:openrouter:...`, `claude:primary:...`). Voice gets a parallel `voice_events` log — every STT/TTS attempt (allowed, capped, denied, errored) is recorded.
- **Single-process minimalism.** No HTTP framework, no Telegram framework runtime, no queue server, no Docker, no sub-agents. A few thousand lines of TypeScript you can read in an afternoon and fork.
-If you need multi-tenancy, voice wake, mobile companions, or 25 chat platforms, use OpenClaw or Hermes. If you want a small, cost-capped, fully audited foundation you can bend to your shape, Solrac fits.
+If you need multi-tenancy, always-listening voice wake, mobile companions, or 25 chat platforms, use OpenClaw or Hermes. If you want a small, cost-capped, fully audited foundation — with optional speech-to-text and text-to-speech on Telegram and the browser — that you can bend to your shape, Solrac fits.
## Quick start
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 774b493..acaf97a 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -1216,6 +1216,151 @@ The existing `policy.ts::createConfirmationBroker` is transport-agnostic — `re
The transport adds `web.ts`, `web-client.ts`, `web-sanitize.ts`, and `markdown.ts`. No HTTP framework, no WebSocket framework, no extra runtime dependencies beyond `marked` (used on both transports). The "no HTTP framework" anti-goal is honored — `Bun.serve` `routes` and `fetch` only, same shape as `server.ts`.
+## Voice transport (optional)
+
+Off by default. Enabled via `VOICE_ENABLED=true` + ElevenLabs credentials. Adds two flows — **speech-in** (operator → text prompt) and **speech-out** (assistant reply → audio) — to both Telegram and the web UI. Implementation lives in two modules:
+
+| Module | Role |
+|---|---|
+| `src/elevenlabs.ts` | Typed `fetch` wrapper for ElevenLabs HTTP. STT (`POST /v1/speech-to-text`, multipart) and TTS-stream (`POST /v1/text-to-speech/{voice_id}/stream`, chunked body). ~165 lines, no SDK. |
+| `src/voice.ts` | Orchestration — gate, cost-cap, audit-write, transport delivery. Exports `handleWebStt`, `handleWebTts`, `handleTelegramVoiceStt`, `maybeReplyWithVoice`, `stripMarkdownForSpeech`, `buildVoiceModePrompt`. |
+
+### The two flows
+
+```
+SPEECH-IN (STT)
+ Telegram voice note Web mic button
+ ─────────────────── ────────────────
+ poll loop → msg.voice MediaRecorder → /api/stt (multipart)
+ │ │
+ │ gateUpdate (allowlist) │ session-cookie auth
+ │ voice cost cap check │ voice cost cap check
+ ▼ ▼
+ getFile + download bytes parse multipart, validate size
+ │ │
+ └──────► voice.handleXxxStt ◄──────┘
+ │
+ ▼
+ ElevenLabs Scribe
+ │
+ ▼
+ voice_events row written
+ │
+ Telegram: synthesize text Update Web: return { ok, text } →
+ → queue.enqueue (normal turn) browser pre-fills composer
+
+SPEECH-OUT (TTS)
+ Telegram: post-turn hook Web: speak button on assistant msg
+ ────────────────────── ──────────────────────────────────
+ agent/engine done + audit closed user click → POST /api/tts
+ │ │ session-cookie auth
+ │ /voice on? (sessions table) │ voice cost cap check
+ │ voice cost cap check │ length wall
+ ▼ ▼
+ stripMarkdownForSpeech(final) stripMarkdownForSpeech(markdown)
+ │ │
+ └──────► ElevenLabs TTS-stream ◄────┘
+ │
+ ▼
+ voice_events row written
+ │
+ Telegram: buffer → sendVoice Web: proxy-stream →
+ (Ogg/Opus) or sendAudio (MP3) blob URL → autoplay
+```
+
+### `voice_events` table — separate from `audit`
+
+One turn can produce **multiple** voice events (one STT input + one TTS output, sometimes). The `audit` table's two-writes-per-turn shape doesn't fit, so voice gets its own append-only log:
+
+```sql
+CREATE TABLE voice_events (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ chat_id INTEGER NOT NULL,
+ ts_ms INTEGER NOT NULL,
+ kind TEXT NOT NULL CHECK (kind IN ('stt','tts')),
+ source TEXT NOT NULL CHECK (source IN ('web','telegram')),
+ model TEXT NOT NULL,
+ voice_id TEXT,
+ audit_id INTEGER, -- informational link to audit.id (not FK)
+ duration_ms INTEGER, -- STT only
+ chars INTEGER, -- TTS only
+ cost_usd_estimate REAL NOT NULL DEFAULT 0,
+ status TEXT NOT NULL CHECK (status IN ('ok','denied_cap','denied_gate','error')),
+ error_message TEXT
+);
+CREATE INDEX idx_voice_events_chat_ts ON voice_events (chat_id, ts_ms);
+CREATE INDEX idx_voice_events_ts ON voice_events (ts_ms);
+```
+
+`audit_id` is informational only (no FK) so a `denied_gate` STT — which never reaches `audit` because no allowlisted sender existed — still gets a row. Cost-cap queries sum `cost_usd_estimate` over a sliding 60-min window, filtered to `status='ok'` so denials don't double-count.
+
+### Independent voice cost cap
+
+Anthropic burn (`audit.cost_usd`) and ElevenLabs burn (`voice_events.cost_usd_estimate`) are **separate axes**. Each has its own per-chat + global sliding-60-min ceiling:
+
+| Axis | Per-chat env var | Global env var | Default |
+|---|---|---|---|
+| Anthropic | `HOURLY_COST_CAP_USD` | `GLOBAL_HOURLY_COST_CAP_USD` | $1.00 / $4.00 (4×) |
+| Voice | `VOICE_HOURLY_COST_CAP_USD` | `VOICE_GLOBAL_HOURLY_COST_CAP_USD` | $0.25 / $1.00 |
+
+Order of checks inside `voice.ts` mirrors the Anthropic cap shape: **global first, then per-chat** (a host-wide hit shouldn't be masked by a per-chat pass). For STT the gate fires before either cap; for TTS the length wall fires after both caps. Cost is **estimated** at write time (ElevenLabs doesn't return per-call billing on the wire) using the configured price constants:
+
+- **STT:** `audio_duration_secs / 3600 × ELEVENLABS_STT_PRICE_USD_PER_HOUR`
+- **TTS:** `chars / 1000 × ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS`
+
+Pin the prices to your ElevenLabs plan if the published defaults don't match.
+
+### Voice mode (`sessions.voice_replies` + `/voice on|off`)
+
+Per-chat sticky toggle backing both Telegram TTS attach AND the word-limit prompt nudge. Added to `sessions` as a `0/1` column (idempotent ALTER). The `/voice` command's parser accepts `on`, `off`, `1`, `0`, `true`, `false`, or no-arg (renders current state).
+
+When `voice_replies=1` for a chat, two things happen on every turn:
+
+1. **`voice.ts::buildVoiceModePrompt`** is called by both SOLRAC.md injection sites, and a voice-mode prompt block is prepended that tells the model to keep the reply under `VOICE_REPLY_WORDS_HINT` words (default 60). The block sits **after** SOLRAC.md and **before** the cross-engine OOB block, so operator overlays can override the word limit if needed. The model may use up to 3× the limit when the user explicitly asks for more.
+2. **`maybeReplyWithVoice`** runs as the post-turn hook (after the audit row closes, only on `!isError`). It strips the markdown, checks the cost cap + length wall, calls ElevenLabs TTS, buffers the audio, and sends via `sendVoice` (Ogg/Opus) or `sendAudio` (MP3 fallback). Web turns don't invoke this — the per-message speak button does it on user demand instead.
+
+The post-turn hook is wired as an optional `attachVoiceReply` callback on `AgentRunDeps` and `EngineRunDeps`. Telegram-bound deps carry the callback; web-bound deps don't. Same VoiceDeps instance backs both Telegram STT/TTS and web STT/TTS — the sliding 60-min cap is shared across transports, so an operator can't double up by talking on web + Telegram simultaneously.
+
+### Footer strip
+
+The `*✅ ...*` line agent/engine append to every successful reply (turn count, cost, model) is UI chrome, not content. `voice.ts::stripMarkdownForSpeech` regex-strips that pattern before tokenizing, so TTS never reads "✅ remote:openrouter:z-ai/glm-5.1 · 1 tools · 6.6s · $0.0048" aloud.
+
+The strip also handles standard markdown → speech transforms via `marked.lexer`:
+- Code fences → `[code block omitted]`
+- Tables → `[table omitted]`
+- Lists → comma-joined items
+- Links → text (URL dropped)
+- Headers / bold / italic → unwrap, keep text
+
+### Env scrub additions
+
+`ELEVENLABS_*` and `VOICE_*` are added to `agent.ts::sanitizedSubprocessEnv`'s scrub list. `ELEVENLABS_API_KEY` is a billed credential that the spawned `claude` SDK subprocess has no business reading; `VOICE_*` (cost caps, model ids) shouldn't leak via an auto-allowed `Bash(echo $VOICE_...)`.
+
+### What about the web sanitizer?
+
+`web-sanitize.ts` deliberately excludes `<audio>` from its allowlist. We do NOT widen it. The `<audio>` element on the web UI is injected via `document.createElement` by `app.js`, AFTER sanitization runs on the reply body — the trust boundary doesn't move. The sanitizer is for marked-rendered LLM content; audio playback is UI chrome.
+
+### Dependency direction
+
+```
+elevenlabs.ts → log + config
+voice.ts → elevenlabs + db + log + config + policy + telegram + marked
+agent.ts → + voice (post-turn hook + buildVoiceModePrompt)
+engine.ts → + voice (post-turn hook + buildVoiceModePrompt)
+web.ts → + voice (handleWebStt, handleWebTts)
+main.ts → + voice (handleTelegramVoiceStt dispatcher, maybeReplyWithVoice)
+commands.ts → unchanged structurally — /voice command dispatches via db.setVoiceRepliesFlag
+```
+
+No new runtime dependency outside `marked` (already shipped). `fetch` and `FormData` are global in Bun.
+
+### Anti-goal preservation
+
+- `marked` is still the only non-SDK runtime dep — ElevenLabs is raw `fetch`, no SDK.
+- No HTTP framework added (the two new routes ride the existing `Bun.serve` instance).
+- No Telegram framework runtime added (`sendVoice`/`sendAudio` use multipart `fetch`).
+- One-PR-per-feature was reversed deliberately for the voice change (phases 1–5 landed together) — per PLAN.md §16, called out as an explicit re-evaluation.
+
## Anti-goals
Decisions deliberately not made. Don't relitigate without strong justification.
diff --git a/docs/CONFIG.md b/docs/CONFIG.md
index a512766..2479fe7 100644
--- a/docs/CONFIG.md
+++ b/docs/CONFIG.md
@@ -52,6 +52,21 @@ Every Solrac knob is an environment variable, validated and frozen at boot by `s
| `SOLRAC_WEB_PORT` | no | `8080` | positive int | Port for the web UI. Must differ from `PORT` (which serves `/health` & `/stats`). |
| `SOLRAC_WEB_TOKEN` | when `SOLRAC_WEB_ENABLED=true` | — | string | Login secret. **Required even on `127.0.0.1`** — a co-tenant on a shared host could otherwise reach the unauthenticated UI. Generate with `openssl rand -hex 32`. Cookie set after login is HttpOnly + SameSite=Strict + Path=/ + Max-Age=24h. |
| `SOLRAC_WEB_CHAT_ID` | no | `-1000` | negative int | Synthetic chat id all web traffic shares. One session per Claude tier, one cost-cap bucket, one `/clear` scope. Negative to avoid collision with real Telegram chat ids. |
+| `VOICE_ENABLED` | no | `false` | boolean | Master switch for voice (ElevenLabs STT + TTS). When `true`, `ELEVENLABS_API_KEY` AND `ELEVENLABS_VOICE_ID` MUST be set (boot fails loud otherwise). Telegram voice notes get transcribed via Scribe; `/voice on` enables per-chat audio replies via the configured voice. The web UI surfaces a mic button + per-message speak button. Independent voice cost cap (per-chat + global, sliding 60-min) over `voice_events.cost_usd_estimate` — Anthropic burn cap is unaffected. |
+| `ELEVENLABS_API_KEY` | when `VOICE_ENABLED=true` | — | string | ElevenLabs API key (`sk_…`). Get one at [elevenlabs.io](https://elevenlabs.io) → Profile + API Keys. Recommended restriction: **Text to Speech + Speech to Text only**, nothing else. **Scrubbed** from the SDK-spawned `claude` subprocess env (`agent.ts::sanitizedSubprocessEnv` strips the entire `ELEVENLABS_*` prefix). |
+| `ELEVENLABS_VOICE_ID` | when `VOICE_ENABLED=true` | — | string | 20-char voice id from ElevenLabs VoiceLab → voice detail page. Single deploy-wide voice (no per-chat override in v1). |
+| `ELEVENLABS_TTS_MODEL` | no | `eleven_flash_v2_5` | string | TTS model id. `eleven_flash_v2_5` is $0.05/1k chars, low latency. Alternatives: `eleven_turbo_v2_5` (similar), `eleven_multilingual_v2` ($0.10/1k, better quality). |
+| `ELEVENLABS_STT_MODEL` | no | `scribe_v2` | string | STT model id. `scribe_v2` is the GA replacement for v1 (March 2026). |
+| `VOICE_TTS_MAX_CHARS` | no | `3000` | positive int | Hard wall — TTS requests over this length (post-markdown-strip) are refused with HTTP 413 (web) or a chat hint (Telegram). The voice-mode prompt nudge (`VOICE_REPLY_WORDS_HINT`) defends against this softly; the wall is the last line of defense. |
+| `VOICE_REPLY_WORDS_HINT` | no | `60` | positive int | Soft target — when `/voice on` is set for a chat, this word budget is injected as a system prompt block. The model may use up to 3× when the user asks for more detail. Clamped to `[30, 200]` at boot (out-of-range warns + clamps). |
+| `VOICE_STT_MAX_BYTES` | no | `2097152` (2 MiB) | positive int | Hard ceiling on audio upload size. Web `/api/stt` rejects oversized bodies before paying Scribe; Telegram voice-note download is bounded by the same cap. |
+| `VOICE_STT_MAX_SECONDS` | no | `60` | positive int | Client-side `MediaRecorder` stop timer for the web UI mic button. |
+| `VOICE_HOURLY_COST_CAP_USD` | no | `0.25` | positive float | **Per-chat** voice cost ceiling. Sliding 60-min window over `voice_events.cost_usd_estimate`. Independent of Anthropic `HOURLY_COST_CAP_USD`. |
+| `VOICE_GLOBAL_HOURLY_COST_CAP_USD` | no | `1.00` | positive float | **Global** voice cost ceiling. Sliding 60-min window across all chats. Independent of Anthropic `GLOBAL_HOURLY_COST_CAP_USD`. |
+| `ELEVENLABS_TTS_OUTPUT_FORMAT_WEB` | no | `mp3_44100_64` | string | Output format the browser `<audio>` element consumes. MP3 plays everywhere (Chromium, Firefox, Safari). |
+| `ELEVENLABS_TTS_OUTPUT_FORMAT_TG` | no | `opus_48000_64` | string | Output format Telegram `sendVoice` consumes. Defaults to Ogg/Opus — verified against ElevenLabs (May 2026 probe). If a future upstream change flips to raw Opus, set to `mp3_44100_64` and the Telegram path uses `sendAudio` instead (the configured format determines which send method is used). |
+| `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` | no | `0.05` | positive float | Pricing constant for TTS cost estimate (used by the voice cost cap). Pin to your ElevenLabs plan if it differs from the published default. |
+| `ELEVENLABS_STT_PRICE_USD_PER_HOUR` | no | `0.22` | positive float | Pricing constant for STT cost estimate. Pin to your plan if it differs. |
## Validation rules
@@ -73,6 +88,7 @@ Every Solrac knob is an environment variable, validated and frozen at boot by `s
- **Local/remote mutex:** `LOCAL_ENABLED=true && REMOTE_ENABLED=true` is rejected at boot. The engine slot has a single driver per boot — picking between modes is structural, not per-message. Operators wanting both should pin `SOLRAC_DEFAULT_ENGINE=primary` and use Claude for the no-prefix path.
- **Local-tools constraint:** `LOCAL_TOOLS_ENABLED=true` requires `SOLRAC_INTEGRATIONS_ENABLED=true` (else there are no tools to expose; boot throws).
- **Web UI constraint:** when `SOLRAC_WEB_ENABLED=true`, `SOLRAC_WEB_TOKEN` must be set (any value; ≥32 chars recommended). `SOLRAC_WEB_PORT` must differ from `PORT`. `SOLRAC_WEB_CHAT_ID` must be a negative integer.
+- **Voice constraint:** when `VOICE_ENABLED=true`, both `ELEVENLABS_API_KEY` AND `ELEVENLABS_VOICE_ID` must be set and non-blank. `VOICE_REPLY_WORDS_HINT` is clamped to `[30, 200]` at boot with a `config.voice_reply_words_clamped` warn line if the operator value falls outside that range. All other voice values must parse as positive numbers / integers when provided. Voice cost caps (`VOICE_HOURLY_COST_CAP_USD`, `VOICE_GLOBAL_HOURLY_COST_CAP_USD`) are **independent** of the Anthropic caps — they sum `voice_events.cost_usd_estimate` over their own sliding 60-min windows.
The returned `Config` object is `Object.freeze`d; `allowlistBootstrap` is also frozen. There's no runtime mutation path.
@@ -175,6 +191,19 @@ SOLRAC_WEB_HOST=127.0.0.1 # 0.0.0.0 to expose on LAN/Tailscale/public
SOLRAC_WEB_PORT=8080 # must differ from PORT
SOLRAC_WEB_TOKEN= # required when enabled; generate: openssl rand -hex 32
# SOLRAC_WEB_CHAT_ID=-1000 # synthetic shared chat id for the web transport
+
+# Voice (ElevenLabs STT + TTS). Off by default. When VOICE_ENABLED=true,
+# ELEVENLABS_API_KEY + ELEVENLABS_VOICE_ID are required (boot fails loud).
+# Independent voice cost cap separate from the Anthropic cap.
+VOICE_ENABLED=false
+# ELEVENLABS_API_KEY=sk_… # restrict the key to TTS + STT only
+# ELEVENLABS_VOICE_ID=… # 20-char id from VoiceLab
+# ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
+# ELEVENLABS_STT_MODEL=scribe_v2
+# VOICE_TTS_MAX_CHARS=3000 # hard wall on TTS input length
+# VOICE_REPLY_WORDS_HINT=60 # soft word budget when /voice on
+# VOICE_HOURLY_COST_CAP_USD=0.25 # per-chat sliding 60-min cap
+# VOICE_GLOBAL_HOURLY_COST_CAP_USD=1.00 # global sliding 60-min cap
```
### Claude-only deploy
@@ -219,6 +248,8 @@ The SDK spawns a `claude` subprocess that **inherits parent env**. Solrac scrubs
- `TG_*` (any prefix)
- `LOCAL_*` (any prefix — backend URL/model)
- `REMOTE_*` (any prefix — OpenRouter API key + base URL)
+- `ELEVENLABS_*` (any prefix — voice API key + voice id)
+- `VOICE_*` (any prefix — voice cost caps + limits)
- `STATS_BEARER_TOKEN`
- `ALLOWLIST_BOOTSTRAP`
- `NOTION_API_KEY`
diff --git a/docs/FEATURES.md b/docs/FEATURES.md
index 11a488a..aac5cb0 100644
--- a/docs/FEATURES.md
+++ b/docs/FEATURES.md
@@ -19,6 +19,7 @@ The complete feature list, grouped by theme. See [../README.md](../README.md) fo
## Transport
- **Optional browser web UI** — a second `Bun.serve` instance on a configurable port serves a minimal vanilla-JS chat interface with the same agent loop, slash commands, engine routing, and tool-confirm UX as Telegram. Full markdown rendering (headers, lists, tables, fenced code) on both transports — Claude and local responses get a server-side markdown→HTML pass for Telegram and the raw markdown to the browser. Off by default; enable with `SOLRAC_WEB_ENABLED=true` plus a token. See [USAGE.md#web-ui-browser-interface](./USAGE.md#web-ui-browser-interface).
+- **Optional voice (ElevenLabs STT + TTS)** — Telegram voice notes get transcribed via ElevenLabs Scribe into the normal text path; the web UI gets a mic button that pre-fills the composer. With `/voice on` (per-chat sticky toggle), Telegram replies attach an audio voice note and a voice-mode system block tells the model to respond in under N words. Web UI gets a per-message 🔊 speak button for on-demand playback (with blob caching — replay doesn't re-bill). Independent cost cap separate from the Anthropic cap. Off by default; enable with `VOICE_ENABLED=true` plus an ElevenLabs key/voice id. See [USAGE.md#voice-elevenlabs-stt--tts](./USAGE.md#voice-elevenlabs-stt--tts).
- **Multi-user, multi-chat** — gated by per-`from.id` allowlist.
## Safety & audit
@@ -26,7 +27,8 @@ The complete feature list, grouped by theme. See [../README.md](../README.md) fo
- **Three-tier permission policy** — auto-allow / auto-deny / Telegram-inline-keyboard-confirm. Configurable rule tables.
- **Per-chat hourly cost cap** — sliding 60-minute window over the audit log. Default $1.00/chat/hour.
- **Loop detector** — denies the third call to the same `(toolName, input)` within a turn. Order-insensitive over JSON keys.
-- **Persistent audit trail** — every turn (allowed, denied, queue-full) writes a SQLite row with prompt, response, tool calls, cost, tokens, session id, status, **and engine** (`claude:primary:` / `claude:secondary:` / `local::` / `remote:openrouter:`).
+- **Persistent audit trail** — every turn (allowed, denied, queue-full) writes a SQLite row with prompt, response, tool calls, cost, tokens, session id, status, **and engine** (`claude:primary:` / `claude:secondary:` / `local::` / `remote:openrouter:`). When voice is enabled, every STT/TTS attempt — allowed, capped, denied at the gate, errored — writes a row to the separate `voice_events` table with kind/source/cost-estimate/status.
+- **Independent voice cost cap** — per-chat and global sliding 60-min ceilings on ElevenLabs spend (`voice_events.cost_usd_estimate`), separate from the Anthropic burn cap. ElevenLabs API key + voice-mode env vars (`ELEVENLABS_*`, `VOICE_*`) are scrubbed from the Claude SDK subprocess env so a compromised model can't exfiltrate the billed credential.
- **Session resume across restarts** — SDK session ids persisted per chat **and per tier**; conversations survive process death.
- **Inline-keyboard confirm UX** — 60-second timeout, fail-closed on send failure, verdict stamped into chat history after tap.
- **Sub-agent default-deny** — `Agent`/`Task` tools disabled at SDK + policy layers.
diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md
index e3c490f..06b5bae 100644
--- a/docs/GLOSSARY.md
+++ b/docs/GLOSSARY.md
@@ -34,6 +34,8 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
**edit (Telegram)** — `editMessageText` API call. Solrac edits its 🤔 stub message rather than sending many small ones. Throttled to 1.5s between edits (`agent.ts:19`).
+**ElevenLabs** — Hosted speech provider used by Solrac's optional voice path. STT via Scribe (`/v1/speech-to-text`); TTS via voice models (`/v1/text-to-speech/{voice_id}/stream`). Two HTTP calls, no SDK. `ELEVENLABS_*` and `VOICE_*` env vars are scrubbed from the Claude SDK subprocess (`agent.ts::sanitizedSubprocessEnv`) so a compromised model can't exfiltrate the billed credential. See `src/elevenlabs.ts`.
+
**from.id** — Telegram user identifier. The user who actually sent a message. Differs from `chat.id` in groups and forwarded messages.
**handled_updates** — SQLite table holding all claimed `update_id`s. Idempotency surface for the poll loop; pruned by a future janitor (deferred to a follow-up).
@@ -90,6 +92,8 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
**stub** — The `🤔 thinking…` placeholder message Solrac sends at turn start, then edits with progress. Final state is the same message edited to the answer + footer (`✅ N turns · $X.XXXX `). No separate "final" message — that's intentional (see ARCHITECTURE.md "No-op-edit guard").
+**STT** — Speech-to-text. ElevenLabs Scribe in Solrac. Telegram voice notes get transcribed inline (`handleTelegramVoiceStt` synthesizes a text Update for the existing dispatcher); the web UI's mic button pre-fills the composer with the transcript. Off by default; gated by `VOICE_ENABLED=true`.
+
**SOUL.md** — Operator-editable persona file at the launch cwd's root. Contains voice, stance, and the `` safety clause. Read once at boot via `instance.ts::loadSoul`; joined with an engine-specific capability note and shipped as `systemPrompt.append` (Claude path) or as the first `system` message (local path). Hard-fails at boot if missing or empty. Mirrors OpenClaw's SOUL concept (voice, not operating rules).
**SOLRAC.md** — Operator-editable instance overlay at the launch cwd's root. Contains operator-specific operating rules (operator name, channel posture, project hints). Re-read per turn so live edits take effect immediately. Wrapped in `... ` and injected at the top of the user-message envelope (Claude path) or as a second `system` message (local path). Soft-warn if missing — Solrac runs vanilla without it. Carries a `solrac-md:unedited` sentinel marker on first install so a fresh template injects nothing until the operator activates the overlay. Analogous to a per-project CLAUDE.md.
@@ -104,6 +108,8 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
**TurnTracker** — `turn-tracker.ts`. Symbol-keyed `Set` tracking active turns. `count` for `/stats`; `drain()` for shutdown.
+**TTS** — Text-to-speech. ElevenLabs voice models. Web UI gets a per-message 🔊 button with a cached blob — replay doesn't re-bill. Telegram gets a voice-note attachment when `/voice on` (`sessions.voice_replies = 1`). `VOICE_TTS_MAX_CHARS` (default 3000) is a length wall before the call fires. Off by default; gated by `VOICE_ENABLED=true`.
+
**`tree_id`** — see above.
**untrusted-content wrapper** — `policy.ts::wrapUntrustedContent(text, source)` returns `text `. Paired with a system-prompt clause that tells the agent to treat such blocks as data, never instructions. v1 has no inbound-attachment intake yet, so the wrapper is wired but unused.
@@ -112,6 +118,12 @@ Terms that recur across Solrac's codebase and docs. Alphabetical.
**verdict** — The user's tap on a confirm prompt: `"allow" | "deny" | "timeout"`. Surfaced from the broker as a `ConfirmDecision`.
+**voice cost cap** — Independent of the Anthropic burn cap. Per-chat `VOICE_HOURLY_COST_CAP_USD` and global `VOICE_GLOBAL_HOURLY_COST_CAP_USD`, both sliding 60-min windows over `voice_events.cost_usd_estimate`. Enforced pre-flight on every STT/TTS attempt. Cap-hit writes a `denied_cap` row with `error_message ∈ {chat_voice_cap, global_voice_cap}` and refuses the call.
+
+**voice mode** — Per-chat sticky toggle: `/voice on` / `/voice off`. Persisted as `sessions.voice_replies` (0 or 1). When on, Telegram replies attach a TTS voice note and a voice-mode system block tells the model to keep replies under N words (`VOICE_REPLY_WORDS_HINT`, clamped to [30,200]). Web UI ignores the flag — its per-message 🔊 button is on-demand. See `commands.ts::runVoiceSet`.
+
+**voice_events** — Append-only SQLite table parallel to `audit`. One row per ElevenLabs attempt with `kind` (`stt`|`tts`), `source` (`web`|`telegram`), `status` (`ok`|`denied_cap`|`denied_gate`|`error`), `cost_usd_estimate`, `duration_ms` (STT) or `chars` (TTS). Source of truth for the voice cost cap. `audit_id` is informational (no FK) so denied-gate STTs — which never reach `audit` — still get a row. See [SCHEMA.md#voice_events](./SCHEMA.md#voice_events).
+
**WAL** — SQLite Write-Ahead Log mode (`PRAGMA journal_mode = WAL`). Concurrent readers + a single writer; checkpointed to truncate on graceful shutdown (`PRAGMA wal_checkpoint(TRUNCATE)` in `lifecycle.ts`).
**web transport** — Optional second transport: a `Bun.serve` instance on `SOLRAC_WEB_HOST:SOLRAC_WEB_PORT` that hosts a browser chat UI. All web traffic shares one synthetic `chat.id` (default `-1000`, settable via `SOLRAC_WEB_CHAT_ID`). Token-gated login (`SOLRAC_WEB_TOKEN`) → HttpOnly + SameSite=Strict cookie. The `WebClient` (`src/web-client.ts`) implements the same `TelegramClient` interface as the bot path, publishing to an in-process bus consumed by SSE. Off by default; see [SETUP.md#11-optional-enable-the-browser-web-ui](./SETUP.md#11-optional-enable-the-browser-web-ui).
diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md
index 86e8012..c11b4c2 100644
--- a/docs/OPERATIONS.md
+++ b/docs/OPERATIONS.md
@@ -676,6 +676,77 @@ GROUP BY skill_name, origin
ORDER BY n DESC;
```
+### Voice events
+
+The `voice_events` table is append-only and parallel to `audit`. Every voice flow (STT in, TTS out) writes a row regardless of outcome — `ok`, `denied_cap` (voice cap fired), `denied_gate` (allowlist), or `error` (upstream / network).
+
+```sql
+-- Recent voice events for a chat (one row per stt/tts attempt)
+SELECT id,
+ datetime(ts_ms/1000, 'unixepoch') AS at,
+ kind,
+ source,
+ status,
+ duration_ms,
+ chars,
+ ROUND(cost_usd_estimate, 4) AS cost,
+ error_message
+FROM voice_events
+WHERE chat_id = ?
+ORDER BY ts_ms DESC
+LIMIT 50;
+```
+
+```sql
+-- Voice spend last 24h, per chat and per kind
+SELECT chat_id,
+ kind,
+ COUNT(*) AS attempts,
+ SUM(CASE status WHEN 'ok' THEN 1 ELSE 0 END) AS ok,
+ SUM(CASE status WHEN 'denied_cap' THEN 1 ELSE 0 END) AS capped,
+ SUM(CASE status WHEN 'error' THEN 1 ELSE 0 END) AS errored,
+ ROUND(SUM(cost_usd_estimate), 4) AS spend
+FROM voice_events
+WHERE ts_ms >= (strftime('%s','now','-1 day') * 1000)
+GROUP BY chat_id, kind
+ORDER BY spend DESC;
+```
+
+```sql
+-- Sliding 60-min global voice spend (matches the in-process cap query)
+SELECT ROUND(SUM(cost_usd_estimate), 4) AS spent_last_hour
+FROM voice_events
+WHERE ts_ms >= (strftime('%s','now','-1 hour') * 1000)
+ AND status = 'ok';
+```
+
+```sql
+-- Voice cap-deny rate by chat (last 7 days). High values mean the cap is too
+-- low OR the operator is using voice heavily — check before raising the cap.
+SELECT chat_id,
+ SUM(CASE status WHEN 'denied_cap' THEN 1 ELSE 0 END) * 100.0 / COUNT(*)
+ AS pct_denied,
+ COUNT(*) AS total_attempts
+FROM voice_events
+WHERE ts_ms >= (strftime('%s','now','-7 day') * 1000)
+GROUP BY chat_id
+HAVING total_attempts >= 5
+ORDER BY pct_denied DESC;
+```
+
+```sql
+-- Upstream errors (ElevenLabs side: rate limits, auth, transcoding issues)
+SELECT datetime(ts_ms/1000, 'unixepoch') AS at,
+ chat_id,
+ kind,
+ source,
+ error_message
+FROM voice_events
+WHERE status = 'error'
+ORDER BY ts_ms DESC
+LIMIT 20;
+```
+
---
## Workspace inspection
diff --git a/docs/RUNBOOK.md b/docs/RUNBOOK.md
index c0be8e2..95e9f75 100644
--- a/docs/RUNBOOK.md
+++ b/docs/RUNBOOK.md
@@ -12,6 +12,8 @@ For day-to-day operations, see [OPERATIONS.md](./OPERATIONS.md).
- [Bot silent, no error in logs](#bot-silent-no-error)
- [Drain timeout on shutdown](#drain-timeout)
- [Runaway cost (cap not firing)](#runaway-cost)
+- [Voice cost runaway (ElevenLabs)](#voice-cost-runaway)
+- [Voice silent / ElevenLabs errors](#voice-errors)
- [DB corruption / lock errors](#db-corruption)
- [OOM kill / runaway memory](#oom)
- [Zombie poller / stale PID](#zombie-poller)
@@ -325,6 +327,152 @@ sudo systemctl start solrac.service
---
+<a id="voice-cost-runaway"></a>
+
+## Voice cost runaway (ElevenLabs)
+
+### Symptoms
+
+- Daily report DM shows voice spend above the configured cap.
+- Users report `/voice on` replies stopped arriving with no error in chat.
+- `voice_events` shows a spike in `cost_usd_estimate` or a wave of `denied_cap` rows.
+
+Note: the voice cost axis is **independent** from the Anthropic axis. A voice runaway does not show up in `audit.cost_usd` or the `/stats` `spend24hUsd` number — check `voice_events` separately.
+
+### Diagnosis
+
+The voice cost cap fires *before* the ElevenLabs call, writing a `denied_cap` row. Find the recent caps:
+
+```sh
+sqlite3 data/solrac.sqlite \
+ "SELECT id, chat_id, kind, source, error_message,
+ datetime(ts_ms/1000,'unixepoch') AS at
+ FROM voice_events
+ WHERE status = 'denied_cap'
+ ORDER BY ts_ms DESC LIMIT 20"
+```
+
+`error_message='global_voice_cap'` → host-wide ceiling fired (every chat muted).
+`error_message='chat_voice_cap'` → only the named chat is muted; others still speak.
+
+Top spenders in the last hour:
+
+```sh
+sqlite3 data/solrac.sqlite \
+ "SELECT chat_id, kind,
+ ROUND(SUM(cost_usd_estimate), 4) AS spent,
+ COUNT(*) AS attempts
+ FROM voice_events
+ WHERE ts_ms >= (strftime('%s','now') - 3600) * 1000
+ AND status = 'ok'
+ GROUP BY chat_id, kind
+ ORDER BY spent DESC"
+```
+
+Likely causes:
+
+1. **Verbose TTS replies.** Long markdown answers get spoken in full; `VOICE_TTS_MAX_CHARS` (default 3000) is a hard wall but doesn't compress shorter replies. `VOICE_REPLY_WORDS_HINT` is a model hint, not enforced.
+2. **TTS price drift.** `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` is operator-set. If your plan changed, the local estimate diverges from reality — but the cap still fires at the operator-set rate. The cap is a behavior gate, not a billing reconciliation.
+3. **Replay button refetching.** Browser cache miss on the web 🔊 button forces a re-synthesis. Each click should hit a cached blob; if not, a UI regression — look for `voice.tts_called` log events repeating against the same `audit_id`.
+
+### Recovery
+
+Immediate stop for one chat — turn voice mode off:
+
+```sh
+sqlite3 data/solrac.sqlite \
+ "UPDATE sessions SET voice_replies = 0 WHERE chat_id = <CHAT_ID>"
+```
+
+(Or have the operator type `/voice off` in that chat.)
+
+Global stop — set `VOICE_ENABLED=false` in `.env` and restart. ElevenLabs calls return early at the boot gate; existing audio messages in chat history are untouched.
+
+Tighten the cap:
+
+```ini
+# .env
+VOICE_HOURLY_COST_CAP_USD=0.10
+VOICE_GLOBAL_HOURLY_COST_CAP_USD=0.50
+```
+
+Restart. Next turns will hit the lower ceiling sooner.
+
+### Prevention
+
+- Turning `/voice on` for a chat doesn't double the model spend — TTS speaks the text the model already produced. The voice axis is purely ElevenLabs.
+- Enable for one chat first, watch `voice_events` for a day, then decide the global ceiling.
+- Pin `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` and `ELEVENLABS_STT_PRICE_USD_PER_HOUR` to your actual plan rate so the local estimate matches your invoice.
+
+---
+
+<a id="voice-errors"></a>
+
+## Voice silent / ElevenLabs errors
+
+### Symptoms
+
+- `/voice on` confirmed, but Telegram replies arrive without an audio attachment.
+- Web mic button records and uploads, but the composer doesn't pre-fill with a transcript.
+- `voice_events` has rows with `status='error'`.
+- Boot exits with `ELEVENLABS_API_KEY is required when VOICE_ENABLED=true` (or the same for `ELEVENLABS_VOICE_ID`).
+
+### Diagnosis
+
+Read the most recent failures:
+
+```sh
+sqlite3 data/solrac.sqlite \
+ "SELECT id, chat_id, kind, source, status, error_message,
+ datetime(ts_ms/1000,'unixepoch') AS at
+ FROM voice_events
+ WHERE status IN ('error','denied_gate')
+ ORDER BY ts_ms DESC LIMIT 10"
+```
+
+| `status` / `error_message` | Cause | Fix |
+|---|---|---|
+| `error` / verbatim 401 message | API key invalid or missing required permission | Verify in the ElevenLabs dashboard that the key has *both* Speech to Text and Text to Speech permissions. A TTS-only key works for `/voice on` but mic button uploads fail. Log line: `voice.auth_failed`. |
+| `error` / verbatim 429 message | ElevenLabs throttled your account | Back off; lower TTS use; upgrade the plan. No Solrac-side retry (the cap already bounds spend). Log line: `voice.rate_limited`. |
+| `error` / verbatim 4xx message | Wrong voice id, malformed multipart, unsupported model | Confirm `ELEVENLABS_VOICE_ID` against the dashboard. For STT, Telegram voice notes are OGG/Opus (supported by Scribe). Log line: `voice.upstream_error` with the upstream status. |
+| `denied_gate` / `denied` | `from.id` not in the allowlist (for voice-note STT) | Add to `ALLOWLIST_BOOTSTRAP`, restart. Allowlist gates apply uniformly to voice and text. |
+| `denied_gate` / `no_from` | Update has no `from.id` (channel post, service message) | Not actionable — these are filtered by design. |
+| `denied_cap` / `chat_voice_cap` or `global_voice_cap` | Voice cost cap fired | See [Voice cost runaway](#voice-cost-runaway). |
+| `error` / `too_long` (rare, never reaches ElevenLabs) | Input exceeded `VOICE_STT_MAX_BYTES` / `VOICE_STT_MAX_SECONDS` or `VOICE_TTS_MAX_CHARS` | Adjust the limit, or trim the input. |
+| (boot rejection) | `VOICE_ENABLED=true` without `ELEVENLABS_API_KEY` or `ELEVENLABS_VOICE_ID` | Set both, restart. Both are required when voice is enabled. |
+
+### Recovery
+
+**Common gotcha: shell-exported `ELEVENLABS_API_KEY` overriding `.env`.** Bun reads `process.env` first; an exported value beats the file. Check:
+
+```sh
+echo "${ELEVENLABS_API_KEY:0:12}" # what the shell sees
+grep ELEVENLABS_API_KEY .env | cut -c1-30 # what the file holds
+```
+
+If they don't match, unset the shell var and restart:
+
+```sh
+unset ELEVENLABS_API_KEY
+sudo systemctl restart solrac.service
+```
+
+For 401 / 403 specifically: rotate the key. Generate a new key in the ElevenLabs dashboard with both required permissions, paste into `.env`, restart. Verify with a direct curl before debugging Solrac further:
+
+```sh
+curl -sS -H "xi-api-key: $ELEVENLABS_API_KEY" https://api.elevenlabs.io/v1/user | jq
+```
+
+For 429: there's no retry layer in `elevenlabs.ts` — that's intentional. Wait it out; the cap already bounds in-Solrac spend.
+
+### Prevention
+
+- One unrestricted dev key + a restricted prod key with only Text to Speech + Speech to Text permissions. ElevenLabs billing is independent of Anthropic, so rotating either key is contained.
+- Never paste the key into chat or commit it. The CLAUDE.md secret-handling rules apply to ElevenLabs keys too.
+- `ELEVENLABS_*` and `VOICE_*` env vars are scrubbed from the Claude SDK subprocess (`agent.ts::sanitizedSubprocessEnv`); a compromised model can't exfiltrate them. If you add a new ElevenLabs-related env, add it to the scrub list in the same PR.
+
+---
+
## DB corruption
### Symptoms
diff --git a/docs/SCHEMA.md b/docs/SCHEMA.md
index 1e23a4f..5920c13 100644
--- a/docs/SCHEMA.md
+++ b/docs/SCHEMA.md
@@ -32,6 +32,7 @@ Pragmas Solrac sets at boot:
| `sessions` | one row per chat | per-tier SDK session ids + pending `/compact` summaries |
| `audit` | one row per attempted turn (allowed, denied, queue-full, tool-call sub-row) | append-mostly; the source of truth |
| `scheduled_tasks` | one row per loaded `TASK.md` | upserted at boot; `last_run_at` / `one_off_consumed` updated by the tick loop |
+| `voice_events` | one row per ElevenLabs attempt (STT or TTS; ok, capped, gated, errored) | append-only; independent axis from `audit` (own sliding 60-min cost cap) |
Authoritative source for shapes + migrations: `src/db.ts` (look at the `SCHEMA` constant and the post-`SCHEMA` `ALTER TABLE` block).
@@ -209,6 +210,48 @@ scheduled_tasks(
One row per task loaded at boot. The loader upserts `source_path / source_hash / updated_at`; the tick loop updates `last_*` on each fire (and flips `one_off_consumed = 1` for `at ` tasks). When a `TASK.md` is removed from disk, the row stays — operators query `scheduled_tasks LEFT JOIN` the registry at runtime. `/tasks` slash command joins this table with the in-memory registry to render last/next fire info.
+### `voice_events`
+
+```sql
+voice_events(
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ chat_id INTEGER NOT NULL,
+ ts_ms INTEGER NOT NULL,
+ kind TEXT NOT NULL CHECK (kind IN ('stt','tts')),
+ source TEXT NOT NULL CHECK (source IN ('web','telegram')),
+ model TEXT NOT NULL, -- e.g. 'scribe_v2', 'eleven_flash_v2_5'
+ voice_id TEXT, -- null for STT
+ audit_id INTEGER, -- informational; NO FK (denied_gate has no audit row)
+ duration_ms INTEGER, -- STT input duration
+ chars INTEGER, -- TTS char count
+ cost_usd_estimate REAL NOT NULL DEFAULT 0, -- local estimate from env-set prices
+ status TEXT NOT NULL CHECK (status IN ('ok','denied_cap','denied_gate','error')),
+ error_message TEXT
+)
+```
+
+Parallel to `audit`, not nested inside it. One turn can emit multiple voice events (a Telegram voice note triggers an STT row, a `/voice on` reply triggers a TTS row — two separate rows). Every gate path writes a row before the ElevenLabs request fires, so denials show up here even when ElevenLabs never saw the call.
+
+#### `status` values
+
+| Value | Meaning | `cost_usd_estimate` |
+|---|---|---|
+| `ok` | ElevenLabs returned 2xx; `duration_ms` (STT) or `chars` (TTS) populated. | computed from env-set price |
+| `denied_cap` | Voice cost cap fired pre-flight. `error_message` ∈ `chat_voice_cap`, `global_voice_cap`. ElevenLabs never called. | 0 |
+| `denied_gate` | Allowlist / voice-disabled rejection. `error_message` ∈ `denied`, `no_from`. ElevenLabs never called. | 0 |
+| `error` | ElevenLabs returned non-2xx or the request threw. `error_message` carries the upstream message verbatim. | 0 |
+
+Special `error_message` value: `too_long` — recorded with `status = 'error'` when the input exceeded `VOICE_STT_MAX_BYTES` / `VOICE_STT_MAX_SECONDS` (STT) or `VOICE_TTS_MAX_CHARS` (TTS) before any request fired.
+
+#### Cost estimate
+
+`cost_usd_estimate` is computed locally — ElevenLabs doesn't return cost on the response. Two operator-set price knobs feed the math:
+
+- STT: `(duration_ms / 1000 / 3600) * ELEVENLABS_STT_PRICE_USD_PER_HOUR`
+- TTS: `(chars / 1000) * ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS`
+
+The cap is enforced against these local estimates, not against ElevenLabs' billing API. If the published prices change, update the env — the table values aren't backfilled.
+
## Indexes
| Index | Columns | Used by |
@@ -217,6 +260,8 @@ One row per task loaded at boot. The loader upserts `source_path / source_hash /
| `idx_audit_chat_started` | `(chat_id, started_at)` | the cost-cap query path (`db.sumChatCostSince`) — fires before every Claude tool call |
| `idx_audit_chat_model_started` | `(chat_id, model, started_at)` | engine-scoped queries: `outOfBandForEngine`, `recentChatTurnsForEngine`, `lastSuccessfulTurnAt`, `countChatTurnsForEngine`, etc. |
| `idx_audit_task_started` | `(task_name, started_at)` | scheduler queries: per-task cost windows (`max_cost_usd` pre-flight) and operator audit dumps |
+| `idx_voice_events_chat_ts` | `(chat_id, ts_ms)` | per-chat voice cost-cap window (`db.voiceCostUsedLast60min`) |
+| `idx_voice_events_ts` | `(ts_ms)` | global voice cost-cap window (`db.voiceCostUsedGlobalLast60min`) |
The first two are declared in the `SCHEMA` constant; the others are created after the migration block so they can reference columns added incrementally on existing databases.
@@ -528,6 +573,91 @@ WHERE (model LIKE 'local:%' OR model LIKE 'ollama:%') -- dual-pattern:
AND started_at >= (strftime('%s','now') - 7*86400) * 1000;
```
+### Voice events
+
+For spend-focused voice queries (cost by chat/kind, daily totals, cap-hit rates) see [OPERATIONS.md#voice-events](./OPERATIONS.md#voice-events). The queries below are debugging-oriented and non-overlapping.
+
+**Trace one voice attempt end-to-end** (link from an `audit.id` to the voice event it produced):
+
+```sql
+SELECT id, chat_id, kind, source, status, error_message,
+ duration_ms, chars, ROUND(cost_usd_estimate, 4) AS cost,
+ datetime(ts_ms/1000,'unixepoch') AS at
+FROM voice_events
+WHERE audit_id = <AUDIT_ID>
+ORDER BY ts_ms;
+```
+
+**Recent voice errors with verbatim ElevenLabs messages.** Only `status='error'` rows carry upstream message strings; gate/cap denials carry their own short codes.
+
+```sql
+SELECT id, chat_id, kind, source, error_message,
+ datetime(ts_ms/1000,'unixepoch') AS at
+FROM voice_events
+WHERE status = 'error'
+ORDER BY ts_ms DESC LIMIT 20;
+```
+
+**Cap-hit breakdown (chat vs global).** `error_message` distinguishes which ceiling fired.
+
+```sql
+SELECT error_message,
+ COUNT(*) AS hits,
+ COUNT(DISTINCT chat_id) AS chats_affected
+FROM voice_events
+WHERE status = 'denied_cap'
+ AND ts_ms >= (strftime('%s','now') - 86400) * 1000
+GROUP BY error_message;
+```
+
+**Gate denials by reason** (`denied` = allowlist; `no_from` = malformed update).
+
+```sql
+SELECT error_message, source, COUNT(*) AS denials
+FROM voice_events
+WHERE status = 'denied_gate'
+GROUP BY error_message, source
+ORDER BY denials DESC;
+```
+
+**Longest STT inputs.** Useful for spotting users uploading long voice notes that risk the size wall.
+
+```sql
+SELECT id, chat_id, source,
+ ROUND(duration_ms / 1000.0, 1) AS seconds,
+ ROUND(cost_usd_estimate, 4) AS cost,
+ datetime(ts_ms/1000,'unixepoch') AS at
+FROM voice_events
+WHERE kind = 'stt' AND status = 'ok'
+ORDER BY duration_ms DESC LIMIT 20;
+```
+
+**TTS char distribution** — where are long replies coming from?
+
+```sql
+SELECT chat_id,
+ COUNT(*) AS tts_calls,
+ MIN(chars) AS min_chars,
+ ROUND(AVG(chars), 0) AS avg_chars,
+ MAX(chars) AS max_chars,
+ ROUND(SUM(cost_usd_estimate), 4) AS total_cost
+FROM voice_events
+WHERE kind = 'tts' AND status = 'ok'
+ AND ts_ms >= (strftime('%s','now') - 7*86400) * 1000
+GROUP BY chat_id
+ORDER BY total_cost DESC;
+```
+
+**`/voice on` state per chat** (lives in `sessions`, not `voice_events`).
+
+```sql
+SELECT chat_id, voice_replies,
+ datetime(updated_at/1000,'unixepoch') AS last_updated
+FROM sessions
+WHERE voice_replies = 1
+ORDER BY updated_at DESC;
+```
+
### Tool inspection
**Tool-call distribution per Claude tier (last 7 days).**
@@ -682,6 +812,8 @@ The `audit` log is the source of truth for "what did the bot do?" but it's not a
- [ARCHITECTURE.md#sqlite-schema](./ARCHITECTURE.md#sqlite-schema) — schema rationale and design decisions
- [OPERATIONS.md#audit-queries](./OPERATIONS.md#audit-queries) — cost-focused operator queries
+- [OPERATIONS.md#voice-events](./OPERATIONS.md#voice-events) — voice spend and cap-hit queries
- [OPERATIONS.md#backup-and-restore](./OPERATIONS.md#backup-and-restore) — backup procedure
- [RUNBOOK.md#db-corruption](./RUNBOOK.md#db-corruption) — recovery from `database disk image is malformed`
+- [RUNBOOK.md#voice-cost-runaway](./RUNBOOK.md#voice-cost-runaway) — voice cost runaway recovery
- `src/db.ts` — schema source of truth, prepared statements, migrations
diff --git a/docs/USAGE.md b/docs/USAGE.md
index f4aa682..42f7cfc 100644
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -172,6 +172,7 @@ Slash commands give you control over conversation context and visibility into sp
| `/context @\|!` | **none** — tier required | Show audit-table footprint (bytes), turn count, last turn's token breakdown (fresh / cache read / cache create / output), and estimated next-turn replay size. **Bare `/context` rejects** for the same reason as `/compact`. | Free |
| `/help` | — | Engine prefix table + command reference. Engine section is dynamic (renders the deploy's actual default). | Free |
| `/status` | — | Per-chat session/spend snapshot + global rollup + queue depth + uptime. Claude session lines render only when a session exists; a `local turns (24h): N` bullet is added when applicable. | Free |
+| `/voice [on\|off]` | shows state | Toggle per-chat voice replies. With voice mode on, Telegram replies attach an audio voice note AND an injected voice-mode system block tells the model to respond in under N words. Works in both Telegram and the web UI (web operators get the word-limit nudge; per-message speak button replaces the auto-attach). Requires `VOICE_ENABLED=true` at deploy. See [Voice (ElevenLabs STT + TTS)](#voice-elevenlabs-stt--tts). | Free (the flag write; TTS spend is independent) |
### Tier args
@@ -1114,6 +1115,97 @@ Claude and the local engine both emit markdown. Solrac now converts markdown to
- Sessions are stored in process memory; restarting Solrac signs out all browsers (operator must log in again). The conversation history is hydrated from the audit log on next page load.
- Confirmation prompts that arrive before the operator opens the UI are silently dropped on broker timeout (60 s). Same failure mode as Telegram when the operator's phone is off.
+## Voice (ElevenLabs STT + TTS)
+
+Off by default. Enabled via `VOICE_ENABLED=true` + an ElevenLabs key/voice id. Adds two affordances on both transports:
+
+- **Speech-in (STT):** Telegram voice notes get transcribed via ElevenLabs Scribe into the normal text path. The web UI gets a mic button that records via `MediaRecorder` and pre-fills the composer with the transcript.
+- **Speech-out (TTS):** with `/voice on` set for a chat, Telegram replies attach an audio voice note. On the web UI, each assistant reply gets a 🔊 button for on-demand playback.
+
+> ⚠️ **Privacy.** Audio (your spoken words) AND the assistant's reply text are sent to ElevenLabs SaaS for transcription and synthesis. `SOUL.md` and `SOLRAC.md` never leave the host, but the conversation content does. Don't enable voice on chats covering material you wouldn't paste into a third-party transcription tool.
+
+### Setup
+
+1. **ElevenLabs account.** Sign up at [elevenlabs.io](https://elevenlabs.io). Pick a paid plan if you want production usage limits — the free tier is fine for testing.
+2. **API key.** Profile + API Keys → Create API Key. Recommended: name it `solrac`, turn **Restrict Key** on, set **Text to Speech → Access** and **Speech to Text → Access**, leave everything else as `No Access`. ElevenLabs reveals the `sk_…` value **once at creation** — copy it immediately.
+3. **Voice id.** Sidebar → Voices (or VoiceLab) → pick a voice → copy the 20-char id from the voice's detail page. One voice per deploy in v1.
+4. **`.env` values:**
+ ```sh
+ VOICE_ENABLED=true
+ ELEVENLABS_API_KEY=sk_… # the value from step 2
+ ELEVENLABS_VOICE_ID=… # the id from step 3
+ ```
+ Restart Solrac. Boot fails loud if either of the two required vars is missing.
+
+Full env reference: [CONFIG.md#variables](./CONFIG.md#variables) (search for `VOICE_` and `ELEVENLABS_`).
+
+### Telegram voice notes
+
+**Sending audio in.** Hold the 🎙️ mic icon in Telegram's input bar (or click it on desktop), record, release to send. Solrac downloads the audio, hits Scribe, synthesizes a text Update, and feeds it back into the queue. Your prefix conventions still apply — if you say "at Sonnet what's the time" out loud, the transcript probably won't start with `@`, so the engine slot answers. To force a tier, type the prefix instead.
+
+The audio file size cap is `VOICE_STT_MAX_BYTES` (default 2 MiB). Telegram caps voice notes at ~1 minute / ~1 MB anyway, so the limit rarely fires.
+
+Solrac handles `msg.voice` only — files attached via "Send a File" (which arrive as `msg.audio`) are out of scope for v1.
+
+**Getting audio out.** Type `/voice on` once per chat — that flips a sticky per-chat flag (`sessions.voice_replies`) so every successful reply from that chat gets an audio note attached. `/voice off` turns it back off; `/voice` with no arg shows the current state. The flag survives restarts.
+
+When voice mode is on:
+- Each turn also injects a voice-mode system block telling the model to respond in under `VOICE_REPLY_WORDS_HINT` words (default 60) — the soft target keeps replies short enough to listen to.
+- After the text reply lands, Solrac strips the markdown, calls ElevenLabs TTS, and sends the audio via Telegram's `sendVoice` (Ogg/Opus voice note pill) or `sendAudio` (MP3 file) depending on the configured output format.
+- Replies longer than `VOICE_TTS_MAX_CHARS` (default 3000) are refused with a chat message — the prompt nudge usually keeps things short enough, but the wall is the last line of defense.
+
+### Web UI voice surface
+
+When `VOICE_ENABLED=true` AND you're logged in to the web UI:
+
+- **🎙️ mic button** in the composer (next to send). Click → grant mic permission → speak → click again to stop (or auto-stop at `VOICE_STT_MAX_SECONDS`, default 60). The transcript pre-fills the composer; review and send.
+- **🔊 speak button** on each assistant reply, bottom-right of the bubble. Only appears once the reply hits its final state (the `✅` footer sentinel). Click → audio plays, button switches to ⏹ (click to stop). Re-click 🔊 to replay from cache (no extra ElevenLabs spend).
+- **Voice mode badge** in the header (top right). Shows `🔊 voice mode on` when `/voice on` is active for the web chat id. Click to disable (sends `/voice off`).
+
+The web speak button is **on-demand**, not automatic — you control when you pay for synthesis. Different shape from Telegram, where voice-mode-on auto-attaches every reply.
+
+### Voice mode and engine compliance
+
+The voice-mode system block is a soft target, not a hard rule:
+
+| Engine | Compliance |
+|---|---|
+| Claude tiers (`@` / `!`) | Strong; ±20-30% drift typical. |
+| Local (Ollama / LMStudio) | Variable. Instruction-tuned models obey; tiny (1-3B param) models may drift wildly. |
+| Remote (OpenRouter) | Depends on the routed model; most modern instruction-tuned models comply. |
+
+If the model ignores the limit AND the reply exceeds `VOICE_TTS_MAX_CHARS`, the hard wall fires and you get the refusal text instead of audio. Two ways to work around:
+1. Ask for a shorter response in the next prompt ("be brief").
+2. Raise `VOICE_TTS_MAX_CHARS` in `.env`.
+
+The "expand 3×" carve-out in the prompt block gives the model room to elaborate when you explicitly ask ("explain in detail") — it'll know it has up to `60 × 3 = 180` words of headroom.
+
+### Voice cost cap
+
+Independent from the Anthropic cap. Two sliding 60-min windows:
+
+| Cap | Env var | Default |
+|---|---|---|
+| Per-chat voice | `VOICE_HOURLY_COST_CAP_USD` | $0.25/hr |
+| Global voice | `VOICE_GLOBAL_HOURLY_COST_CAP_USD` | $1.00/hr |
+
+When a cap fires:
+- Telegram STT: reply text `voice cap reached, try again in a minute`.
+- Telegram TTS: silent (text reply already shipped; we just skip the audio attach).
+- Web STT: HTTP 429, browser toast `voice cap reached`.
+- Web TTS: HTTP 429, browser toast `voice cap reached`.
+
+Cost is **estimated** from duration (STT) or character count (TTS) using the configured price constants. ElevenLabs doesn't return per-call billing on the wire, so estimates may drift from your statement — pin `ELEVENLABS_TTS_PRICE_USD_PER_1K_CHARS` and `ELEVENLABS_STT_PRICE_USD_PER_HOUR` to your plan if you want closer alignment.
+
+Audit queries: [OPERATIONS.md#voice-events](./OPERATIONS.md#voice-events) — every voice event (allowed, capped, denied, errored) writes a row to the `voice_events` table.
+
+### Notes & limits (v1)
+
+- Single deploy-wide voice. No per-chat override yet — change `ELEVENLABS_VOICE_ID` to switch voices for everyone.
+- Page reload drops cached audio blobs. Re-clicking 🔊 after a reload re-pays for TTS. Disk cache is a future enhancement.
+- No real-time WebSocket STT. File-based round-trip is enough at conversational latency.
+- Telegram voice mode attaches one audio note per turn; long multi-paragraph replies still respect the 3000-char wall.
+
## Related docs
- [GLOSSARY.md](./GLOSSARY.md) — terminology reference
diff --git a/public/app.js b/public/app.js
index 9b56133..0580c9e 100644
--- a/public/app.js
+++ b/public/app.js
@@ -30,6 +30,9 @@ const els = {
logoutBtn: $("logout-btn"),
connState: $("conn-state"),
enginePills: document.querySelectorAll(".engine-opt"),
+ micBtn: $("mic-btn"),
+ voiceBadge: $("voice-badge"),
+ toastHost: $("toast-host"),
};
// Track DOM nodes by message_id so streaming edits can replace them in place.
@@ -42,6 +45,22 @@ const messageNodes = new Map();
let activeEngine = "";
let stream = null;
let firstTab = true;
+// Voice mode (mirror of sessions.voice_replies for the web chat id). Refreshed
+// from /api/voice/state on boot and after the user sends /voice on|off.
+let voiceModeOn = false;
+// Voice availability for the deploy. Probed by GET-ing /api/voice/state in
+// refreshVoiceState(): a 2xx reply means voice is enabled; 503 means the
+// server has VOICE_ENABLED=false and the mic/speak surface stays hidden.
+// Any other status (e.g. 401) leaves this flag unchanged.
+let voiceFeatureAvailable = false;
+// MediaRecorder state. `recorder` is the active recorder; `recordTimeoutId`
+// is the auto-stop timer that fires at VOICE_STT_MAX_SECONDS (60s server-side).
+let recorder = null;
+let recordTimeoutId = null;
+let recordChunks = [];
+// 60s — matches server-side VOICE_STT_MAX_SECONDS default. Client-side cap
+// keeps us from uploading audio the server will reject anyway.
+const RECORD_MAX_MS = 60_000;
// ── Boot ───────────────────────────────────────────────
@@ -78,6 +97,8 @@ function bindUi() {
pill.addEventListener("click", () => setEngine(pill.dataset.prefix));
}
setEngine("");
+ els.micBtn?.addEventListener("click", onMicClick);
+ els.voiceBadge?.addEventListener("click", () => sendUser("/voice off"));
}
// ── Auth ───────────────────────────────────────────────
@@ -98,6 +119,10 @@ function enterChat(history) {
scrollToBottom();
openStream();
els.composerText.focus();
+ // Probe voice availability once per session. /api/voice/state returns 200
+ // when VOICE_ENABLED=true, 503 when off. We surface the mic + speak
+ // buttons only when the deploy supports them.
+ refreshVoiceState();
}
async function onLoginSubmit(e) {
@@ -185,6 +210,14 @@ function handleEvent(event) {
if (!node) return;
const stick = isNearBottom();
renderBody(node.querySelector(".body"), event.markdown_source, event.html);
+ // Update the stashed markdown so the speak button (added when the
+ // final-state sentinel appears) picks up the latest text. Each edit
+ // overwrites — by the time `✅` lands, dataset.markdown has the full
+ // final reply.
+ if (typeof event.markdown_source === "string") {
+ node.dataset.markdown = event.markdown_source;
+ }
+ maybeAddSpeakButton(node);
if (stick) scrollToBottom();
} else if (event.kind === "reaction") {
// We don't render reactions in the web UI v1.
@@ -233,6 +266,12 @@ async function sendUser(text) {
els.sendBtn.disabled = false;
els.composerText.focus();
}
+ // Slash command that flips sessions.voice_replies — refresh the badge
+ // shortly after so the UI reflects new state. Small delay lets the
+ // command's audit row settle before we re-query.
+ if (/^\s*\/voice(\s|$)/i.test(text)) {
+ window.setTimeout(refreshVoiceState, 500);
+ }
}
function onComposerKeyDown(e) {
@@ -297,6 +336,9 @@ function appendMessage({ role, markdown, html_fallback, message_id }) {
const li = document.createElement("li");
li.className = `msg ${role}`;
if (typeof message_id === "number") li.dataset.messageId = String(message_id);
+ // Stash the raw markdown on the LI so the speak button can read it
+ // straight from the DOM rather than tracking a parallel Map.
+ if (typeof markdown === "string") li.dataset.markdown = markdown;
const roleLabel = document.createElement("div");
roleLabel.className = "role";
roleLabel.textContent = role === "user" ? "you" : "solrac";
@@ -307,6 +349,7 @@ function appendMessage({ role, markdown, html_fallback, message_id }) {
li.appendChild(body);
els.messages.appendChild(li);
if (typeof message_id === "number") messageNodes.set(message_id, li);
+ if (role === "assistant") maybeAddSpeakButton(li);
return li;
}
@@ -344,3 +387,286 @@ function isNearBottom() {
const el = els.messages;
return el.scrollHeight - el.scrollTop - el.clientHeight <= STICK_THRESHOLD_PX;
}
+
+// ── Voice ──────────────────────────────────────────────
+
+// Re-query /api/voice/state and bring the voice UI in line with the answer.
+//   - HTTP 503: both voice flags are cleared and the mic button + badge hidden.
+//   - any other non-OK status: nothing is touched.
+//   - OK: the mic button is shown, the badge mirrors `enabled === true` from
+//     the JSON body, and every tracked message node gets a speak-button check.
+// Never rejects — fetch/parse failures are swallowed — so it is safe to hand
+// straight to a timer.
+async function refreshVoiceState() {
+  try {
+    const response = await fetch("/api/voice/state");
+    if (response.status === 503) {
+      // Deploy has VOICE_ENABLED=false. Hide mic + speak surfaces entirely.
+      voiceFeatureAvailable = false;
+      voiceModeOn = false;
+      for (const el of [els.micBtn, els.voiceBadge]) {
+        el?.classList.add("hidden");
+      }
+      return;
+    }
+    if (!response.ok) return;
+    const payload = await response.json();
+    voiceFeatureAvailable = true;
+    voiceModeOn = payload.enabled === true;
+    els.micBtn?.classList.remove("hidden");
+    const badge = els.voiceBadge;
+    if (badge) {
+      // Badge is visible exactly while voice mode is on.
+      badge.classList.toggle("hidden", !voiceModeOn);
+      if (voiceModeOn) badge.textContent = "🔊 voice mode on";
+    }
+    // Voice may have become available mid-session (e.g. after a server
+    // restart): already-rendered messages should grow their speak button too.
+    messageNodes.forEach((node) => maybeAddSpeakButton(node));
+  } catch {
+    // Network error — leave state as-is.
+  }
+}
+
+// Mic button handler. A click while a recorder exists stops it; otherwise
+// this asks for microphone access, starts a MediaRecorder on the stream and
+// arms a timer that force-stops the recording after RECORD_MAX_MS.
+// Does nothing when the voice feature is unavailable. Failures (permission
+// denied, unsupported mime, …) surface as a toast and reset the button.
+async function onMicClick() {
+  if (!voiceFeatureAvailable) return;
+  if (recorder) {
+    stopRecording(); // user tap to stop mid-flight
+    return;
+  }
+  // Declared outside the try so the catch can release the microphone when a
+  // step AFTER getUserMedia throws (MediaRecorder construction or start()).
+  let mediaStream = null;
+  try {
+    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    const mime = pickMediaRecorderMime();
+    recorder = new MediaRecorder(mediaStream, mime ? { mimeType: mime } : undefined);
+    recordChunks = [];
+    recorder.addEventListener("dataavailable", (e) => {
+      if (e.data && e.data.size > 0) recordChunks.push(e.data);
+    });
+    recorder.addEventListener("stop", () => onRecordingStop(mediaStream));
+    recorder.start();
+    setMicState("recording");
+    recordTimeoutId = window.setTimeout(() => stopRecording(), RECORD_MAX_MS);
+  } catch (err) {
+    // Without this the tracks stay live and the browser keeps showing the
+    // "microphone in use" indicator even though nothing is recording.
+    if (mediaStream) {
+      for (const track of mediaStream.getTracks()) track.stop();
+    }
+    showToast(`mic error: ${err?.message ?? "permission denied"}`);
+    recorder = null;
+    setMicState("idle");
+  }
+}
+
+// Stop the active recorder, if any, and disarm the max-length timer.
+// The recorder's "stop" listener takes over from here.
+function stopRecording() {
+  const active = recorder;
+  if (!active) return;
+  const pendingTimer = recordTimeoutId;
+  if (pendingTimer !== null) {
+    recordTimeoutId = null;
+    window.clearTimeout(pendingTimer);
+  }
+  try {
+    active.stop();
+  } catch {
+    // already inactive
+  }
+}
+
+// Recorder "stop" handler: releases the microphone, uploads the captured
+// audio to /api/stt and appends the transcript to the composer.
+//   mediaStream — the stream the recorder was started on; its tracks are
+//                 stopped here.
+// Never rejects. Every exit path after the upload starts returns the mic
+// button to "idle" via the `finally` below.
+async function onRecordingStop(mediaStream) {
+  setMicState("uploading");
+  // The recorder can stop without stopRecording() having run. Disarm the
+  // max-length timer here too, otherwise a stale timer would cut the NEXT
+  // recording short.
+  if (recordTimeoutId !== null) {
+    window.clearTimeout(recordTimeoutId);
+    recordTimeoutId = null;
+  }
+  // Stop the audio tracks so the browser indicator clears immediately.
+  for (const t of mediaStream.getTracks()) t.stop();
+  const chunks = recordChunks;
+  const mime = recorder?.mimeType || "audio/webm";
+  recorder = null;
+  recordChunks = [];
+  if (chunks.length === 0) {
+    setMicState("idle");
+    return;
+  }
+  const blob = new Blob(chunks, { type: mime });
+  // Name the upload after the container actually recorded; an audio/mp4
+  // recording must not be labelled .webm.
+  const ext = mime.includes("ogg") ? "ogg" : mime.includes("mp4") ? "mp4" : "webm";
+  const form = new FormData();
+  form.append("audio", blob, `audio.${ext}`);
+  try {
+    const res = await fetch("/api/stt", { method: "POST", body: form });
+    if (res.status === 401) {
+      enterLogin();
+      return;
+    }
+    if (res.status === 413) {
+      showToast("audio too large — try a shorter clip");
+      return;
+    }
+    if (res.status === 429) {
+      showToast("voice cap reached — try again in a minute");
+      return;
+    }
+    if (!res.ok) {
+      const errBody = await res.json().catch(() => ({}));
+      showToast(`transcription failed: ${errBody?.message ?? res.status}`);
+      return;
+    }
+    const body = await res.json();
+    const text = typeof body?.text === "string" ? body.text : "";
+    if (text) {
+      // Pre-fill the composer; cursor at end so a quick edit-then-send is
+      // one keystroke. Operator decides whether to send — defends against
+      // STT errors. No auto-send in v1.
+      const existing = els.composerText.value;
+      els.composerText.value = existing ? `${existing} ${text}` : text;
+      autoResize();
+      els.composerText.focus();
+      const len = els.composerText.value.length;
+      els.composerText.setSelectionRange(len, len);
+    }
+  } catch (err) {
+    showToast(`network error: ${err?.message ?? "unknown"}`);
+  } finally {
+    // Single reset point for every return/throw above.
+    setMicState("idle");
+  }
+}
+
+// Paint the mic button for one of the states "idle" | "recording" |
+// "uploading": CSS class, disabled flag (only while uploading) and tooltip.
+// No-op when the button element is missing.
+function setMicState(state) {
+  const button = els.micBtn;
+  if (!button) return;
+  const isRecording = state === "recording";
+  const isUploading = state === "uploading";
+  button.classList.toggle("recording", isRecording);
+  button.classList.toggle("uploading", isUploading);
+  button.disabled = isUploading;
+  let tooltip = "record voice";
+  if (isRecording) {
+    tooltip = "stop recording";
+  } else if (isUploading) {
+    tooltip = "uploading…";
+  }
+  button.title = tooltip;
+}
+
+// MediaRecorder mime varies by browser:
+//   - Chromium → 'audio/webm;codecs=opus' (preferred)
+//   - Firefox  → same
+//   - Safari   → 'audio/mp4' (Safari rejects 'audio/webm')
+// Returns the first candidate the browser reports as supported, or null when
+// MediaRecorder / isTypeSupported is missing or nothing matches — the caller
+// then lets MediaRecorder pick its own default (Scribe v2 accepts both
+// webm/opus and mp4/aac).
+function pickMediaRecorderMime() {
+  const canProbe = typeof MediaRecorder !== "undefined" && Boolean(MediaRecorder.isTypeSupported);
+  if (!canProbe) return null;
+  const preferenceOrder = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4"];
+  const supported = preferenceOrder.find((type) => MediaRecorder.isTypeSupported(type));
+  return supported ?? null;
+}
+
+// ── Speak (TTS) ────────────────────────────────────────
+
+// Detect "final state" of an assistant message — the agent and engine
+// runners suffix successful turns with a `✅ N turns · $X.XXXX` footer
+// in the markdown source (see agent.ts::buildFooter). When that sentinel
+// is present we know the stream is settled and the speak button is safe
+// to expose. Mid-stream the button stays absent so the operator can't
+// pay for TTS on a partial reply.
+// Returns false for anything that is not a string.
+// NOTE(review): the check is a plain substring match on `*✅`, so ordinary
+// reply text such as `**✅ done**` also matches — confirm against the exact
+// footer format whether a stricter match is wanted.
+function isFinalAssistantMarkdown(md) {
+  return typeof md === "string" && md.includes("*✅");
+}
+
+// Attach a 🔊 button to a message node, at most once, and only when:
+// voice is available, the node is not a user bubble, no speak button is
+// present yet, and the stashed markdown looks final.
+function maybeAddSpeakButton(node) {
+  const skip =
+    !voiceFeatureAvailable ||
+    node.classList.contains("user") ||
+    node.querySelector(".speak-btn") !== null || // already added
+    !isFinalAssistantMarkdown(node.dataset.markdown);
+  if (skip) return;
+  const speakBtn = Object.assign(document.createElement("button"), {
+    type: "button",
+    className: "speak-btn",
+    title: "speak this reply",
+    textContent: "🔊",
+  });
+  speakBtn.setAttribute("aria-label", "speak this reply");
+  speakBtn.addEventListener("click", () => onSpeakClick(node, speakBtn));
+  node.appendChild(speakBtn);
+}
+
+// Speak-button handler for one message node.
+//   node — the message element; carries dataset.markdown / dataset.messageId
+//          and, after the first successful fetch, the cached audio element.
+//   btn  — the speak button that was clicked.
+// Behaviour by state:
+//   playing          → stop and rewind.
+//   cached, stopped  → replay the cached audio (no new TTS request).
+//   nothing cached   → POST /api/tts, attach a hidden audio element, play.
+// Never rejects; failures surface as toasts.
+async function onSpeakClick(node, btn) {
+  const existingAudio = node.querySelector(".speak-audio");
+  // Click while playing → stop. Audio element stays attached (and its
+  // blob URL stays live) so the next click replays from cache without
+  // re-billing ElevenLabs.
+  if (existingAudio && !existingAudio.paused) {
+    existingAudio.pause();
+    existingAudio.currentTime = 0;
+    setSpeakButtonState(btn, "idle");
+    return;
+  }
+  // Click after a previous play that ended/stopped → replay from cache.
+  if (existingAudio) {
+    existingAudio.currentTime = 0;
+    try {
+      await existingAudio.play();
+      setSpeakButtonState(btn, "playing");
+    } catch {
+      // Autoplay policy or other transient — silently ignore; the user
+      // can click again. No toast for replay failures.
+    }
+    return;
+  }
+  // First click → fetch TTS, attach a hidden <audio> element, autoplay.
+  const markdown = node.dataset.markdown ?? "";
+  const messageId = node.dataset.messageId ? Number(node.dataset.messageId) : null;
+  if (!markdown) return;
+  btn.disabled = true;
+  btn.classList.add("loading");
+  try {
+    const res = await fetch("/api/tts", {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ message_id: messageId, markdown }),
+    });
+    if (res.status === 401) {
+      enterLogin();
+      return;
+    }
+    if (res.status === 413) {
+      showToast("reply too long to speak — try /voice on for terser replies");
+      return;
+    }
+    if (res.status === 429) {
+      const body = await res.json().catch(() => ({}));
+      showToast(body?.message ?? "voice cap reached");
+      return;
+    }
+    if (!res.ok) {
+      const body = await res.json().catch(() => ({}));
+      showToast(`speak failed: ${body?.message ?? res.status}`);
+      return;
+    }
+    const blob = await res.blob();
+    const url = URL.createObjectURL(blob);
+    // Inject audio via DOM API (NOT innerHTML) — the sanitizer is for
+    // marked-rendered LLM content; this audio element is UI chrome added
+    // by app code, so the trust boundary doesn't move (plan §10.3).
+    // No `controls` attr — CSS hides the element entirely; the speak
+    // button is the only UI affordance.
+    const audio = document.createElement("audio");
+    audio.src = url;
+    audio.className = "speak-audio";
+    audio.dataset.blobUrl = url;
+    audio.addEventListener("play", () => setSpeakButtonState(btn, "playing"));
+    audio.addEventListener("pause", () => setSpeakButtonState(btn, "idle"));
+    audio.addEventListener("ended", () => setSpeakButtonState(btn, "idle"));
+    node.appendChild(audio);
+    try {
+      await audio.play();
+    } catch (playErr) {
+      if (playErr?.name === "NotAllowedError") {
+        // Autoplay was blocked (the click's activation can lapse while the
+        // TTS request is in flight). The audio is cached, so one more click
+        // plays it through the replay path — no second TTS request.
+        showToast("audio ready — click 🔊 again to play");
+      } else {
+        // Unplayable payload: drop the element and its blob URL so the next
+        // click re-fetches instead of silently replaying a broken cache.
+        audio.remove();
+        URL.revokeObjectURL(url);
+        showToast(`speak failed: ${playErr?.message ?? "playback error"}`);
+      }
+    }
+  } catch (err) {
+    showToast(`network error: ${err?.message ?? "unknown"}`);
+  } finally {
+    btn.disabled = false;
+    btn.classList.remove("loading");
+  }
+}
+
+// Switch a speak button between its two looks: "playing" shows a stop glyph
+// and the `playing` class; any other state shows the speaker glyph.
+function setSpeakButtonState(btn, state) {
+  const isPlaying = state === "playing";
+  btn.classList.toggle("playing", isPlaying);
+  if (isPlaying) {
+    btn.textContent = "⏹";
+    btn.title = "stop";
+  } else {
+    btn.textContent = "🔊";
+    btn.title = "speak this reply";
+  }
+}
+
+// ── Toast ──────────────────────────────────────────────
+
+// Show a transient message in the toast host. The toast gets the `fading`
+// class after 4s and is removed from the DOM 500ms later. No-op when the
+// host element is missing. Text is set via textContent, never as HTML.
+function showToast(text) {
+  const host = els.toastHost;
+  if (!host) return;
+  const toast = Object.assign(document.createElement("div"), {
+    className: "toast",
+    textContent: text,
+  });
+  host.appendChild(toast);
+  // The CSS animation handles the fade itself; we only flag it and clean up.
+  const beginFade = () => {
+    toast.classList.add("fading");
+    window.setTimeout(() => toast.remove(), 500);
+  };
+  window.setTimeout(beginFade, 4000);
+}
diff --git a/public/index.html b/public/index.html
index 0ef7cff..c7655bd 100644
--- a/public/index.html
+++ b/public/index.html
@@ -23,6 +23,7 @@ solrac