diff --git a/docs/integrations-and-sdks/livekit.mdx b/docs/integrations-and-sdks/livekit.mdx deleted file mode 100644 index e8d091da..00000000 --- a/docs/integrations-and-sdks/livekit.mdx +++ /dev/null @@ -1,63 +0,0 @@ ---- -id: livekit -description: Learn how to integrate Speechmatics STT with LiveKit Agents. ---- - -# LiveKit integration - -LiveKit Agents is a framework for building real-time, voice-enabled AI applications that connect with LiveKit rooms. -With the Speechmatics STT plugin, you enable your voice agent to transcribe live audio in real-time, along with speaker diarization, turn detection and noise robustness. - -LiveKit integrations are perfect for: - -- **Voice AI**: voice assistants, chatbots, and IVR systems -- **Transcription**: live events or recordings -- **Accessibility**: screen readers and assistive technologies, in-app help widgets -- **Media**: news broadcasts, automated announcements - -## Features - -- **Realtime transcription**: instant, accurate speech-to-text -- **Speaker diarization**: identify and separate multiple speakers automatically -- **Turn detection**: detect natural speech boundaries and pauses -- **Noise robustness**: maintain accuracy in challenging environments -- **Global language support**: works with diverse accents and dialects -- **Partial results**: receive interim transcriptions for faster response times -- **High accuracy**: industry-leading word recognition for natural conversations - -## Quickstart - -### Requirements - -- LiveKit >= 1.2 -- [Speechmatics account](https://portal.speechmatics.com). -- Speechmatics API key. You can generate one in the [Portal](https://portal.speechmatics.com/settings/api-keys). 
- -### Installation - -```python -uv add "livekit-agents[speechmatics]~=1.2" -``` - -### Usage - -```python -import os -import asyncio - -from livekit.agents import AgentSession, cli -from livekit.plugins import speechmatics - -async def run_agent(room): - session = AgentSession( - stt=speechmatics.STT( - api_key=os.environ["SPEECHMATICS_API_KEY"], - ), - ) - await session.join(room) - -if __name__ == "__main__": - cli.run_app(run_agent) -``` - -For detailed examples, please see the [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy). diff --git a/docs/integrations-and-sdks/livekit/assets/.gitkeep b/docs/integrations-and-sdks/livekit/assets/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/docs/integrations-and-sdks/livekit/assets/main.py b/docs/integrations-and-sdks/livekit/assets/main.py new file mode 100644 index 00000000..9ea2c802 --- /dev/null +++ b/docs/integrations-and-sdks/livekit/assets/main.py @@ -0,0 +1,56 @@ +from dotenv import load_dotenv +from livekit import agents +from livekit.agents import AgentSession, Agent, RoomInputOptions +from livekit.plugins import openai, silero, speechmatics +from livekit.plugins.speechmatics import TurnDetectionMode + +load_dotenv(".env.local") + + +class VoiceAssistant(Agent): + def __init__(self): + super().__init__( + instructions="You are a helpful voice assistant. Be concise and friendly." 
+ ) + + +async def entrypoint(ctx: agents.JobContext): + await ctx.connect() + + # Speech-to-Text: Speechmatics + stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.SMART_TURN, + ) + + # Language Model: OpenAI + llm = openai.LLM(model="gpt-4o-mini") + + # Text-to-Speech: Speechmatics + tts = speechmatics.TTS() + + # Voice Activity Detection: Silero + vad = silero.VAD.load() + + # Create and start session + session = AgentSession( + stt=stt, + llm=llm, + tts=tts, + vad=vad, + ) + + await session.start( + room=ctx.room, + agent=VoiceAssistant(), + room_input_options=RoomInputOptions(), + ) + + await session.generate_reply( + instructions="Say a short hello and ask how you can help." + ) + + +if __name__ == "__main__": + agents.cli.run_app( + agents.WorkerOptions(entrypoint_fnc=entrypoint), + ) diff --git a/docs/integrations-and-sdks/livekit/assets/stt-full-example.py b/docs/integrations-and-sdks/livekit/assets/stt-full-example.py new file mode 100644 index 00000000..333b09f6 --- /dev/null +++ b/docs/integrations-and-sdks/livekit/assets/stt-full-example.py @@ -0,0 +1,48 @@ +from livekit.agents import AgentSession +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import ( + AdditionalVocabEntry, + AudioEncoding, + OperatingPoint, + SpeakerFocusMode, + SpeakerIdentifier, + TurnDetectionMode, +) + +stt = speechmatics.STT( + # Service options + language="en", + output_locale="en-US", + operating_point=OperatingPoint.ENHANCED, + + # Turn detection + turn_detection_mode=TurnDetectionMode.ADAPTIVE, + max_delay=1.5, + include_partials=True, + + # Diarization + enable_diarization=True, + speaker_sensitivity=0.6, + max_speakers=4, + prefer_current_speaker=True, + + # Speaker focus + focus_speakers=["S1", "S2"], + focus_mode=SpeakerFocusMode.RETAIN, + ignore_speakers=["__ASSISTANT__"], + + # Output formatting + speaker_active_format="[{speaker_id}]: {text}", + speaker_passive_format="[{speaker_id} (background)]: {text}", + + # Custom 
vocabulary + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics"), + AdditionalVocabEntry(content="LiveKit", sounds_like=["live kit", "livekit"]), + ], +) + +session = AgentSession( + stt=stt, + # ... llm, tts, vad, etc. +) diff --git a/docs/integrations-and-sdks/livekit/index.mdx b/docs/integrations-and-sdks/livekit/index.mdx new file mode 100644 index 00000000..510df978 --- /dev/null +++ b/docs/integrations-and-sdks/livekit/index.mdx @@ -0,0 +1,122 @@ +--- +description: Build a voice AI agent with Speechmatics STT and TTS using LiveKit Agents. +--- + +import CodeBlock from '@theme/CodeBlock' +import livekitQuickstartMainPy from "./assets/main.py?raw" + +# LiveKit quickstart + +Build a real-time voice AI agent with Speechmatics and LiveKit in minutes. + +[LiveKit Agents](https://docs.livekit.io/agents/) is a framework for building voice AI applications using WebRTC. With the Speechmatics plugin, you get accurate speech recognition and natural text-to-speech for your voice agents. + +## Features + +- **Real-time transcription** — Low-latency speech-to-text as users speak +- **Speaker diarization** — Identify and track multiple speakers +- **Smart turn detection** — Know when the user has finished speaking +- **Natural TTS voices** — Choose from multiple voice options +- **Noise robustness** — Accurate recognition in challenging audio environments +- **Global language support** — Works with diverse accents and dialects + +## Prerequisites + +- Python 3.10+ +- [Speechmatics API key](https://portal.speechmatics.com) +- [LiveKit Cloud account](https://cloud.livekit.io) (free tier available) +- [OpenAI API key](https://platform.openai.com) (for the LLM) + +## Setup + +This guide assumes LiveKit Cloud. If you want to self-host LiveKit instead, follow LiveKit's self-hosting guide and configure `LIVEKIT_URL`, `LIVEKIT_API_KEY`, and `LIVEKIT_API_SECRET` for your deployment: https://docs.livekit.io/transport/self-hosting/ + +### 1. 
Create project + +```bash +mkdir voice-agent && cd voice-agent +``` + +### 2. Install dependencies + +```bash +uv init +uv add "livekit-agents[speechmatics,openai,silero]==1.4.2" python-dotenv +``` + +### 3. Install and authenticate the LiveKit CLI + +Install the LiveKit CLI. For additional installation options, see the LiveKit CLI setup guide: https://docs.livekit.io/home/cli/cli-setup/ + +**macOS**: + +```text +brew install livekit-cli +``` + +**Linux**: + +```text +curl -sSL https://get.livekit.io/cli | bash +``` + +**Windows**: + +```text +winget install LiveKit.LiveKitCLI +``` + +Authenticate and link your LiveKit Cloud project: + +```bash +lk cloud auth +``` + +### 4. Configure environment + +Run the LiveKit CLI to write your LiveKit Cloud credentials to a `.env.local` file: + +```bash +lk app env -w +``` + +This creates a `.env.local` file with your LiveKit credentials. Add your Speechmatics and OpenAI keys: + +```bash title=".env.local" +LIVEKIT_URL=wss://your-project.livekit.cloud +LIVEKIT_API_KEY=... +LIVEKIT_API_SECRET=... +SPEECHMATICS_API_KEY=your_speechmatics_key +OPENAI_API_KEY=your_openai_key +``` + +### 5. Create your agent + +Create a `main.py` file: + +<CodeBlock language="python"> + {livekitQuickstartMainPy} +</CodeBlock> + +### 6. Run your agent + +Run your agent in `dev` mode to connect it to LiveKit and make it available from anywhere on the internet: + +```bash +python main.py dev +``` + +Open the [LiveKit Agents Playground](https://agents-playground.livekit.io) to test your agent. 
+ +Run your agent in `console` mode to speak to it locally in your terminal: + +```bash +python main.py console +``` + +## Next steps + +- [Speech to text](/integrations-and-sdks/livekit/stt) — Configure diarization, turn detection, and more +- [Text to speech](/integrations-and-sdks/livekit/tts) — Choose voices and adjust settings +- [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy/tree/main/integrations/livekit) — Full working examples +- [LiveKit deployment](https://docs.livekit.io/agents/deployment/) — Deploy to production diff --git a/docs/integrations-and-sdks/livekit/sidebar.ts b/docs/integrations-and-sdks/livekit/sidebar.ts new file mode 100644 index 00000000..0b6f94a4 --- /dev/null +++ b/docs/integrations-and-sdks/livekit/sidebar.ts @@ -0,0 +1,23 @@ +export default { + type: "category", + label: "LiveKit", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + id: "integrations-and-sdks/livekit/index", + label: "Quickstart", + }, + { + type: "doc", + id: "integrations-and-sdks/livekit/stt", + label: "STT", + }, + { + type: "doc", + id: "integrations-and-sdks/livekit/tts", + label: "TTS", + }, + ], +} as const; diff --git a/docs/integrations-and-sdks/livekit/stt.mdx b/docs/integrations-and-sdks/livekit/stt.mdx new file mode 100644 index 00000000..ed2b9660 --- /dev/null +++ b/docs/integrations-and-sdks/livekit/stt.mdx @@ -0,0 +1,269 @@ +--- +description: Transcribe live audio in your LiveKit voice agents with Speechmatics STT. +--- + +import CodeBlock from '@theme/CodeBlock' +import sttFullExample from "./assets/stt-full-example.py?raw" + +# LiveKit speech to text + +Use the Speechmatics STT plugin to transcribe live audio in your LiveKit voice agents. 
+ +## Features + +- **Real-time transcription** — Low-latency streaming with partial (interim) results +- **Turn detection** — Adaptive, fixed, ML-based, or external control modes +- **Speaker diarization** — Identify and attribute speech to different speakers +- **Speaker filtering** — Focus on specific speakers or ignore others (like the assistant) +- **Custom vocabulary** — Boost recognition for domain-specific terms and proper nouns +- **Output formatting** — Configurable templates for multi-speaker transcripts + +## Installation + +```bash +uv add "livekit-agents[speechmatics]~=1.4" +``` + +## Basic configuration + +### Authentication + +By default, the plugin reads your API key from `SPEECHMATICS_API_KEY`. + +### Service options + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `language` | string | `"en"` | Language code for transcription | +| `output_locale` | string \| null | `null` | Output locale (for example `"en-GB"`) | +| `domain` | string \| null | `null` | Domain-specific model (for example `"finance"`) | +| `operating_point` | OperatingPoint \| null | `null` | Transcription accuracy. Use `OperatingPoint.ENHANCED` (higher accuracy) or `OperatingPoint.STANDARD` (lower latency) | +| `base_url` | string | env var | Realtime base URL (defaults to `SPEECHMATICS_RT_URL`, or `wss://eu2.rt.speechmatics.com/v2`) | +| `api_key` | string | env var | Speechmatics API key (defaults to `SPEECHMATICS_API_KEY`) | +| `sample_rate` | number | `16000` | Audio sample rate in Hz. 
Valid values: `8000` or `16000` | +| `audio_encoding` | AudioEncoding | `PCM_S16LE` | Audio encoding format: `AudioEncoding.PCM_S16LE`, `AudioEncoding.PCM_F32LE`, or `AudioEncoding.MULAW` | +| `punctuation_overrides` | object \| null | `null` | Custom punctuation rules | + +#### Example + +```python +from livekit.agents import AgentSession +from livekit.plugins import speechmatics + +session = AgentSession( + stt=speechmatics.STT( + language="en", + output_locale="en-GB", + ), + # ... llm, tts, vad, etc. +) +``` + +## Advanced configuration + +### Turn detection + +The Speechmatics STT plugin uses the Speechmatics Voice SDK for endpointing and turn detection. +Turn detection determines when a user has finished their complete thought, while the Realtime API's `EndOfUtterance` message indicates a pause in speech. The plugin handles this distinction automatically. + +#### Modes + +Set `turn_detection_mode` to control how end of speech is detected: + +| Mode | When to use | +|------|-------------| +| `TurnDetectionMode.ADAPTIVE` | Default. Adjusts silence threshold based on speech rate, pauses, and disfluencies. Requires `speechmatics-voice[smart]` | +| `TurnDetectionMode.FIXED` | Fixed silence threshold using `end_of_utterance_silence_trigger` | +| `TurnDetectionMode.SMART_TURN` | ML-based endpointing using acoustic cues for more natural turn-taking. 
Requires `speechmatics-voice[smart]` | +| `TurnDetectionMode.EXTERNAL` | You control turn boundaries manually (for example using your own VAD and calling `finalize()`) | + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import TurnDetectionMode + +# Adaptive mode (default) - adjusts to speech patterns +# Requires: pip install speechmatics-voice[smart] +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.ADAPTIVE, +) + +# Fixed mode - consistent silence threshold +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.FIXED, + end_of_utterance_silence_trigger=0.8, # 800ms of silence + end_of_utterance_max_delay=5.0, # Force end after 5s +) + +# Smart turn mode - ML-based natural turn-taking +# Requires: pip install speechmatics-voice[smart] +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.SMART_TURN, +) + +# External mode - manual control via finalize() +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.EXTERNAL, +) +``` + +#### Manual turn finalization + +When using `TurnDetectionMode.EXTERNAL`, you control when a turn ends by calling `finalize()` on the STT instance. This is useful when you have your own VAD or want to integrate with external signals. + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import TurnDetectionMode + +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.EXTERNAL, +) + +# Later, when you detect the user has finished speaking: +stt.finalize() +``` + +#### Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `end_of_utterance_silence_trigger` | number \| null | `null` | Silence duration (seconds) that triggers end of utterance. Used primarily in `FIXED` mode. Valid range: >0 to <2 seconds (exclusive) | +| `end_of_utterance_max_delay` | number \| null | `null` | Maximum delay (seconds) before forcing an end of utterance. 
Must be greater than `end_of_utterance_silence_trigger` | +| `max_delay` | number \| null | `null` | Maximum transcription delay (seconds). Lower values reduce latency at the cost of accuracy. Valid range: 0.7–4.0 seconds | +| `include_partials` | boolean \| null | `null` | Enable partial (interim) transcription results. When `null`, defaults to `true` | + +### Advanced diarization + +The plugin can attribute words to speakers and lets you decide which speakers are treated as **active** (foreground) vs **passive** (background). + +#### Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `enable_diarization` | boolean \| null | `null` | Enable speaker diarization | +| `speaker_sensitivity` | number \| null | `null` | Speaker detection sensitivity. Valid range: >0.0 to <1.0 (exclusive) | +| `max_speakers` | number \| null | `null` | Maximum number of speakers to detect. Valid range: 2–100 | +| `prefer_current_speaker` | boolean \| null | `null` | Reduce speaker switching for similar voices | +| `known_speakers` | array \| null | `null` | Pre-define speaker identifiers with labels (`SpeakerIdentifier` objects) | +| `additional_vocab` | array \| null | `null` | Custom vocabulary entries (`AdditionalVocabEntry` objects) for improved recognition | + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import AdditionalVocabEntry + +stt = speechmatics.STT( + enable_diarization=True, + speaker_sensitivity=0.7, + max_speakers=3, + prefer_current_speaker=True, + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics"), + AdditionalVocabEntry(content="API", sounds_like=["A P I"]), + ], +) +``` + +#### Known speakers + +Use `known_speakers` to attribute words to specific speakers across sessions. This is useful when you want consistent speaker identification for known participants. 
+ +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import SpeakerIdentifier + +stt = speechmatics.STT( + enable_diarization=True, + known_speakers=[ + SpeakerIdentifier(label="Alice", speaker_identifiers=["speaker_abc123"]), + SpeakerIdentifier(label="Bob", speaker_identifiers=["speaker_def456"]), + ], +) +``` + +Speaker identifiers are unique to each Speechmatics account and can be obtained from a previous transcription session. + +#### Speaker focus + +Control which speakers are treated as **active** (foreground) vs **passive** (background): + +- **Active speakers** are the speakers you care about in your application. They generate `FINAL_TRANSCRIPT` events. +- **Passive speakers** are still transcribed, but their words are buffered and only included in the output alongside new words from active speakers. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `focus_speakers` | array | `[]` | Speaker IDs to treat as active | +| `ignore_speakers` | array | `[]` | Speaker IDs to exclude entirely | +| `focus_mode` | SpeakerFocusMode | `RETAIN` | How to handle non-focused speakers | + +##### Focus modes + +- `SpeakerFocusMode.RETAIN` keeps non-focused speakers as passive. +- `SpeakerFocusMode.IGNORE` discards non-focused speaker words entirely. + +`ignore_speakers` always excludes those speakers from transcription and their speech will not trigger VAD or end of utterance detection. + +:::tip +By default, any speaker label wrapped in double underscores (for example `__ASSISTANT__`) is automatically excluded. This convention lets you filter out assistant audio without explicitly adding it to `ignore_speakers`. 
+::: + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import SpeakerFocusMode + +stt = speechmatics.STT( + focus_speakers=["S1"], + focus_mode=SpeakerFocusMode.RETAIN, + ignore_speakers=["S3"], +) +``` + +#### Speaker formatting + +Use `speaker_active_format` and `speaker_passive_format` to format transcripts for your LLM. +The templates support `{speaker_id}` and `{text}`. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `speaker_active_format` | string \| null | `null` | Format template for active speaker output | +| `speaker_passive_format` | string \| null | `null` | Format template for passive speaker output | + +```python +from livekit.plugins import speechmatics + +stt = speechmatics.STT( + speaker_active_format="<{speaker_id}>{text}", + speaker_passive_format="<{speaker_id} background>{text}", +) +``` + +When you use a custom format, include it in your agent instructions so the LLM can interpret speaker tags consistently. + +#### Updating speakers during transcription + +You can dynamically change which speakers to focus on or ignore during an active transcription session using the `update_speakers()` method. + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import SpeakerFocusMode + +stt = speechmatics.STT(enable_diarization=True) + +# Later, during transcription: +stt.update_speakers( + focus_speakers=["S1", "S2"], + ignore_speakers=["S3"], + focus_mode=SpeakerFocusMode.RETAIN, +) +``` + +This is useful when you need to adjust speaker filtering based on runtime conditions, such as when a new participant joins or leaves a conversation. 
+ +#### Example + +<CodeBlock language="python"> + {sttFullExample} +</CodeBlock> + +## Next steps + +- [Quickstart](/integrations-and-sdks/livekit) — Build a complete voice agent +- [Text to speech](/integrations-and-sdks/livekit/tts) — Use Speechmatics voices in your agent diff --git a/docs/integrations-and-sdks/livekit/tts.mdx b/docs/integrations-and-sdks/livekit/tts.mdx new file mode 100644 index 00000000..ebd83031 --- /dev/null +++ b/docs/integrations-and-sdks/livekit/tts.mdx @@ -0,0 +1,41 @@ +--- +description: Use Speechmatics text-to-speech voices in your LiveKit voice agents. +--- + +# LiveKit text to speech + +Give your LiveKit voice agent natural, expressive speech with Speechmatics TTS. + +## Installation + +```bash +uv add "livekit-agents[speechmatics]~=1.4" +``` + +## Usage + +```python +from livekit.agents import AgentSession +from livekit.plugins import speechmatics + +session = AgentSession( + tts=speechmatics.TTS(), + # ... stt, llm, etc. +) +``` + +## Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `voice` | string | `"sarah"` | Voice model to use | +| `api_key` | string | env var | Speechmatics API key (defaults to `SPEECHMATICS_API_KEY`) | + +For available voices and detailed TTS options, see the [Text to speech quickstart](/text-to-speech/quickstart). 
+ + + +## Next steps + +- [Quickstart](/integrations-and-sdks/livekit) — Build a complete voice agent +- [Speech to text](/integrations-and-sdks/livekit/stt) — Configure STT options diff --git a/docs/integrations-and-sdks/sidebar.ts b/docs/integrations-and-sdks/sidebar.ts index 337d99a5..05b66e38 100644 --- a/docs/integrations-and-sdks/sidebar.ts +++ b/docs/integrations-and-sdks/sidebar.ts @@ -1,3 +1,5 @@ +import livekitSidebar from "./livekit/sidebar"; + export default { type: "category", label: "Integrations and SDKs", @@ -14,20 +16,16 @@ export default { id: "integrations-and-sdks/vapi", label: "Vapi", }, - { - type: "doc", - id: "integrations-and-sdks/livekit", - label: "LiveKit", - }, + livekitSidebar, { type: "doc", id: "integrations-and-sdks/pipecat", label: "Pipecat", }, { - type: "doc", - id: "integrations-and-sdks/sdks", - label: "SDKs", + type: "doc", + id: "integrations-and-sdks/sdks", + label: "SDKs", }, -] -} \ No newline at end of file + ] +}; \ No newline at end of file