From b57804b762e9ef3d0c9cfe72dabfcd7c2dd6795b Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Wed, 25 Feb 2026 11:12:07 +0000 Subject: [PATCH 1/4] docs: scaffold Pipecat integration section --- .../pipecat/assets/.gitkeep | 0 .../pipecat/assets/main.py | 56 ++++ .../pipecat/assets/stt-full-example.py | 48 ++++ docs/integrations-and-sdks/pipecat/index.mdx | 122 ++++++++ docs/integrations-and-sdks/pipecat/sidebar.ts | 23 ++ docs/integrations-and-sdks/pipecat/stt.mdx | 269 ++++++++++++++++++ docs/integrations-and-sdks/pipecat/tts.mdx | 41 +++ docs/integrations-and-sdks/sidebar.ts | 23 +- 8 files changed, 568 insertions(+), 14 deletions(-) create mode 100644 docs/integrations-and-sdks/pipecat/assets/.gitkeep create mode 100644 docs/integrations-and-sdks/pipecat/assets/main.py create mode 100644 docs/integrations-and-sdks/pipecat/assets/stt-full-example.py create mode 100644 docs/integrations-and-sdks/pipecat/index.mdx create mode 100644 docs/integrations-and-sdks/pipecat/sidebar.ts create mode 100644 docs/integrations-and-sdks/pipecat/stt.mdx create mode 100644 docs/integrations-and-sdks/pipecat/tts.mdx diff --git a/docs/integrations-and-sdks/pipecat/assets/.gitkeep b/docs/integrations-and-sdks/pipecat/assets/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/docs/integrations-and-sdks/pipecat/assets/main.py b/docs/integrations-and-sdks/pipecat/assets/main.py new file mode 100644 index 00000000..9ea2c802 --- /dev/null +++ b/docs/integrations-and-sdks/pipecat/assets/main.py @@ -0,0 +1,56 @@ +from dotenv import load_dotenv +from livekit import agents +from livekit.agents import AgentSession, Agent, RoomInputOptions +from livekit.plugins import openai, silero, speechmatics +from livekit.plugins.speechmatics import TurnDetectionMode + +load_dotenv(".env.local") + + +class VoiceAssistant(Agent): + def __init__(self): + super().__init__( + instructions="You are a helpful voice assistant. Be concise and friendly." 
+ ) + + +async def entrypoint(ctx: agents.JobContext): + await ctx.connect() + + # Speech-to-Text: Speechmatics + stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.SMART_TURN, + ) + + # Language Model: OpenAI + llm = openai.LLM(model="gpt-4o-mini") + + # Text-to-Speech: Speechmatics + tts = speechmatics.TTS() + + # Voice Activity Detection: Silero + vad = silero.VAD.load() + + # Create and start session + session = AgentSession( + stt=stt, + llm=llm, + tts=tts, + vad=vad, + ) + + await session.start( + room=ctx.room, + agent=VoiceAssistant(), + room_input_options=RoomInputOptions(), + ) + + await session.generate_reply( + instructions="Say a short hello and ask how you can help." + ) + + +if __name__ == "__main__": + agents.cli.run_app( + agents.WorkerOptions(entrypoint_fnc=entrypoint), + ) diff --git a/docs/integrations-and-sdks/pipecat/assets/stt-full-example.py b/docs/integrations-and-sdks/pipecat/assets/stt-full-example.py new file mode 100644 index 00000000..333b09f6 --- /dev/null +++ b/docs/integrations-and-sdks/pipecat/assets/stt-full-example.py @@ -0,0 +1,48 @@ +from livekit.agents import AgentSession +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import ( + AdditionalVocabEntry, + AudioEncoding, + OperatingPoint, + SpeakerFocusMode, + SpeakerIdentifier, + TurnDetectionMode, +) + +stt = speechmatics.STT( + # Service options + language="en", + output_locale="en-US", + operating_point=OperatingPoint.ENHANCED, + + # Turn detection + turn_detection_mode=TurnDetectionMode.ADAPTIVE, + max_delay=1.5, + include_partials=True, + + # Diarization + enable_diarization=True, + speaker_sensitivity=0.6, + max_speakers=4, + prefer_current_speaker=True, + + # Speaker focus + focus_speakers=["S1", "S2"], + focus_mode=SpeakerFocusMode.RETAIN, + ignore_speakers=["__ASSISTANT__"], + + # Output formatting + speaker_active_format="[{speaker_id}]: {text}", + speaker_passive_format="[{speaker_id} (background)]: {text}", + + # Custom 
vocabulary + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics"), + AdditionalVocabEntry(content="LiveKit", sounds_like=["live kit", "livekit"]), + ], +) + +session = AgentSession( + stt=stt, + # ... llm, tts, vad, etc. +) diff --git a/docs/integrations-and-sdks/pipecat/index.mdx b/docs/integrations-and-sdks/pipecat/index.mdx new file mode 100644 index 00000000..510df978 --- /dev/null +++ b/docs/integrations-and-sdks/pipecat/index.mdx @@ -0,0 +1,122 @@ +--- +description: Build a voice AI agent with Speechmatics STT and TTS using LiveKit Agents. +--- + +import CodeBlock from '@theme/CodeBlock' +import livekitQuickstartMainPy from "./assets/main.py?raw" + +# LiveKit quickstart + +Build a real-time voice AI agent with Speechmatics and LiveKit in minutes. + +[LiveKit Agents](https://docs.livekit.io/agents/) is a framework for building voice AI applications using WebRTC. With the Speechmatics plugin, you get accurate speech recognition and natural text-to-speech for your voice agents. + +## Features + +- **Real-time transcription** — Low-latency speech-to-text as users speak +- **Speaker diarization** — Identify and track multiple speakers +- **Smart turn detection** — Know when the user has finished speaking +- **Natural TTS voices** — Choose from multiple voice options +- **Noise robustness** — Accurate recognition in challenging audio environments +- **Global language support** — Works with diverse accents and dialects + +## Prerequisites + +- Python 3.10+ +- [Speechmatics API key](https://portal.speechmatics.com) +- [LiveKit Cloud account](https://cloud.livekit.io) (free tier available) +- [OpenAI API key](https://platform.openai.com) (for the LLM) + +## Setup + +This guide assumes LiveKit Cloud. If you want to self-host LiveKit instead, follow LiveKit's self-hosting guide and configure `LIVEKIT_URL`, `LIVEKIT_API_KEY`, and `LIVEKIT_API_SECRET` for your deployment: https://docs.livekit.io/transport/self-hosting/ + +### 1. 
Create project + +```bash +mkdir voice-agent && cd voice-agent +``` + +### 2. Install dependencies + +```bash +uv init +uv add "livekit-agents[speechmatics,openai,silero]==1.4.2" python-dotenv +``` + +### 3. Install and authenticate the LiveKit CLI + +Install the LiveKit CLI. For additional installation options, see the LiveKit CLI setup guide: https://docs.livekit.io/home/cli/cli-setup/ + +**macOS**: + +```text +brew install livekit-cli +``` + +**Linux**: + +```text +curl -sSL https://get.livekit.io/cli | bash +``` + +**Windows**: + +```text +winget install LiveKit.LiveKitCLI +``` + +Authenticate and link your LiveKit Cloud project: + +```bash +lk cloud auth +``` + +### 4. Configure environment + +Run the LiveKit CLI to write your LiveKit Cloud credentials to a `.env.local` file: + +```bash +lk app env -w +``` + +This creates a `.env.local` file with your LiveKit credentials. Add your Speechmatics and OpenAI keys: + +```bash title=".env.local" +LIVEKIT_URL=wss://your-project.livekit.cloud +LIVEKIT_API_KEY=... +LIVEKIT_API_SECRET=... +SPEECHMATICS_API_KEY=your_speechmatics_key +OPENAI_API_KEY=your_openai_key +``` + +### 5. Create your agent + +Create a `main.py` file: + + + {livekitQuickstartMainPy} + + +### 6. Run your agent + +Run your agent in `dev` mode to connect it to LiveKit and make it available from anywhere on the internet: + +```bash +python main.py dev +``` + +Open the [LiveKit Agents Playground](https://agents-playground.livekit.io) to test your agent. 
+ +Run your agent in `console` mode to speak to it locally in your terminal: + +```bash +python main.py console +``` + +## Next steps + +- [Speech to text](/integrations-and-sdks/livekit/stt) — Configure diarization, turn detection, and more +- [Text to speech](/integrations-and-sdks/livekit/tts) — Choose voices and adjust settings +- [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy/tree/main/integrations/livekit) — Full working examples +- [LiveKit deployment](https://docs.livekit.io/agents/deployment/) — Deploy to production diff --git a/docs/integrations-and-sdks/pipecat/sidebar.ts b/docs/integrations-and-sdks/pipecat/sidebar.ts new file mode 100644 index 00000000..13d626dd --- /dev/null +++ b/docs/integrations-and-sdks/pipecat/sidebar.ts @@ -0,0 +1,23 @@ +export default { + type: "category", + label: "Pipecat", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + id: "integrations-and-sdks/pipecat/index", + label: "Quickstart", + }, + { + type: "doc", + id: "integrations-and-sdks/pipecat/stt", + label: "STT", + }, + { + type: "doc", + id: "integrations-and-sdks/pipecat/tts", + label: "TTS", + }, + ], +} as const; diff --git a/docs/integrations-and-sdks/pipecat/stt.mdx b/docs/integrations-and-sdks/pipecat/stt.mdx new file mode 100644 index 00000000..ed2b9660 --- /dev/null +++ b/docs/integrations-and-sdks/pipecat/stt.mdx @@ -0,0 +1,269 @@ +--- +description: Transcribe live audio in your LiveKit voice agents with Speechmatics STT. +--- + +import CodeBlock from '@theme/CodeBlock' +import sttFullExample from "./assets/stt-full-example.py?raw" + +# LiveKit speech to text + +Use the Speechmatics STT plugin to transcribe live audio in your LiveKit voice agents. 
+ +## Features + +- **Real-time transcription** — Low-latency streaming with partial (interim) results +- **Turn detection** — Adaptive, fixed, ML-based, or external control modes +- **Speaker diarization** — Identify and attribute speech to different speakers +- **Speaker filtering** — Focus on specific speakers or ignore others (like the assistant) +- **Custom vocabulary** — Boost recognition for domain-specific terms and proper nouns +- **Output formatting** — Configurable templates for multi-speaker transcripts + +## Installation + +```bash +uv add "livekit-agents[speechmatics]~=1.4" +``` + +## Basic configuration + +### Authentication + +By default, the plugin reads your API key from `SPEECHMATICS_API_KEY`. + +### Service options + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `language` | string | `"en"` | Language code for transcription | +| `output_locale` | string \| null | `null` | Output locale (for example `"en-GB"`) | +| `domain` | string \| null | `null` | Domain-specific model (for example `"finance"`) | +| `operating_point` | OperatingPoint \| null | `null` | Transcription accuracy. Use `OperatingPoint.ENHANCED` (higher accuracy) or `OperatingPoint.STANDARD` (lower latency) | +| `base_url` | string | env var | Realtime base URL (defaults to `SPEECHMATICS_RT_URL`, or `wss://eu2.rt.speechmatics.com/v2`) | +| `api_key` | string | env var | Speechmatics API key (defaults to `SPEECHMATICS_API_KEY`) | +| `sample_rate` | number | `16000` | Audio sample rate in Hz. 
Valid values: `8000` or `16000` | +| `audio_encoding` | AudioEncoding | `PCM_S16LE` | Audio encoding format: `AudioEncoding.PCM_S16LE`, `AudioEncoding.PCM_F32LE`, or `AudioEncoding.MULAW` | +| `punctuation_overrides` | object \| null | `null` | Custom punctuation rules | + +#### Example + +```python +from livekit.agents import AgentSession +from livekit.plugins import speechmatics + +session = AgentSession( + stt=speechmatics.STT( + language="en", + output_locale="en-GB", + ), + # ... llm, tts, vad, etc. +) +``` + +## Advanced configuration + +### Turn detection + +The Speechmatics STT plugin uses the Speechmatics Voice SDK for endpointing and turn detection. +Turn detection determines when a user has finished their complete thought, while the Realtime API's `EndOfUtterance` message indicates a pause in speech. The plugin handles this distinction automatically. + +#### Modes + +Set `turn_detection_mode` to control how end of speech is detected: + +| Mode | When to use | +|------|-------------| +| `TurnDetectionMode.ADAPTIVE` | Default. Adjusts silence threshold based on speech rate, pauses, and disfluencies. Requires `speechmatics-voice[smart]` | +| `TurnDetectionMode.FIXED` | Fixed silence threshold using `end_of_utterance_silence_trigger` | +| `TurnDetectionMode.SMART_TURN` | ML-based endpointing using acoustic cues for more natural turn-taking. 
Requires `speechmatics-voice[smart]` | +| `TurnDetectionMode.EXTERNAL` | You control turn boundaries manually (for example using your own VAD and calling `finalize()`) | + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import TurnDetectionMode + +# Adaptive mode (default) - adjusts to speech patterns +# Requires: pip install speechmatics-voice[smart] +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.ADAPTIVE, +) + +# Fixed mode - consistent silence threshold +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.FIXED, + end_of_utterance_silence_trigger=0.8, # 800ms of silence + end_of_utterance_max_delay=5.0, # Force end after 5s +) + +# Smart turn mode - ML-based natural turn-taking +# Requires: pip install speechmatics-voice[smart] +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.SMART_TURN, +) + +# External mode - manual control via finalize() +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.EXTERNAL, +) +``` + +#### Manual turn finalization + +When using `TurnDetectionMode.EXTERNAL`, you control when a turn ends by calling `finalize()` on the STT instance. This is useful when you have your own VAD or want to integrate with external signals. + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import TurnDetectionMode + +stt = speechmatics.STT( + turn_detection_mode=TurnDetectionMode.EXTERNAL, +) + +# Later, when you detect the user has finished speaking: +stt.finalize() +``` + +#### Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `end_of_utterance_silence_trigger` | number \| null | `null` | Silence duration (seconds) that triggers end of utterance. Used primarily in `FIXED` mode. Valid range: >0 to <2 seconds (exclusive) | +| `end_of_utterance_max_delay` | number \| null | `null` | Maximum delay (seconds) before forcing an end of utterance. 
Must be greater than `end_of_utterance_silence_trigger` | +| `max_delay` | number \| null | `null` | Maximum transcription delay (seconds). Lower values reduce latency at the cost of accuracy. Valid range: 0.7–4.0 seconds | +| `include_partials` | boolean \| null | `null` | Enable partial (interim) transcription results. When `null`, defaults to `true` | + +### Advanced diarization + +The plugin can attribute words to speakers and lets you decide which speakers are treated as **active** (foreground) vs **passive** (background). + +#### Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `enable_diarization` | boolean \| null | `null` | Enable speaker diarization | +| `speaker_sensitivity` | number \| null | `null` | Speaker detection sensitivity. Valid range: >0.0 to <1.0 (exclusive) | +| `max_speakers` | number \| null | `null` | Maximum number of speakers to detect. Valid range: 2–100 | +| `prefer_current_speaker` | boolean \| null | `null` | Reduce speaker switching for similar voices | +| `known_speakers` | array \| null | `null` | Pre-define speaker identifiers with labels (`SpeakerIdentifier` objects) | +| `additional_vocab` | array \| null | `null` | Custom vocabulary entries (`AdditionalVocabEntry` objects) for improved recognition | + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import AdditionalVocabEntry + +stt = speechmatics.STT( + enable_diarization=True, + speaker_sensitivity=0.7, + max_speakers=3, + prefer_current_speaker=True, + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics"), + AdditionalVocabEntry(content="API", sounds_like=["A P I"]), + ], +) +``` + +#### Known speakers + +Use `known_speakers` to attribute words to specific speakers across sessions. This is useful when you want consistent speaker identification for known participants. 
+ +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import SpeakerIdentifier + +stt = speechmatics.STT( + enable_diarization=True, + known_speakers=[ + SpeakerIdentifier(label="Alice", speaker_identifiers=["speaker_abc123"]), + SpeakerIdentifier(label="Bob", speaker_identifiers=["speaker_def456"]), + ], +) +``` + +Speaker identifiers are unique to each Speechmatics account and can be obtained from a previous transcription session. + +#### Speaker focus + +Control which speakers are treated as **active** (foreground) vs **passive** (background): + +- **Active speakers** are the speakers you care about in your application. They generate `FINAL_TRANSCRIPT` events. +- **Passive speakers** are still transcribed, but their words are buffered and only included in the output alongside new words from active speakers. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `focus_speakers` | array | `[]` | Speaker IDs to treat as active | +| `ignore_speakers` | array | `[]` | Speaker IDs to exclude entirely | +| `focus_mode` | SpeakerFocusMode | `RETAIN` | How to handle non-focused speakers | + +##### Focus modes + +- `SpeakerFocusMode.RETAIN` keeps non-focused speakers as passive. +- `SpeakerFocusMode.IGNORE` discards non-focused speaker words entirely. + +`ignore_speakers` always excludes those speakers from transcription and their speech will not trigger VAD or end of utterance detection. + +:::tip +By default, any speaker label wrapped in double underscores (for example `__ASSISTANT__`) is automatically excluded. This convention lets you filter out assistant audio without explicitly adding it to `ignore_speakers`. 
+::: + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import SpeakerFocusMode + +stt = speechmatics.STT( + focus_speakers=["S1"], + focus_mode=SpeakerFocusMode.RETAIN, + ignore_speakers=["S3"], +) +``` + +#### Speaker formatting + +Use `speaker_active_format` and `speaker_passive_format` to format transcripts for your LLM. +The templates support `{speaker_id}` and `{text}`. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `speaker_active_format` | string \| null | `null` | Format template for active speaker output | +| `speaker_passive_format` | string \| null | `null` | Format template for passive speaker output | + +```python +from livekit.plugins import speechmatics + +stt = speechmatics.STT( + speaker_active_format="<{speaker_id}>{text}", + speaker_passive_format="<{speaker_id} background>{text}", +) +``` + +When you use a custom format, include it in your agent instructions so the LLM can interpret speaker tags consistently. + +#### Updating speakers during transcription + +You can dynamically change which speakers to focus on or ignore during an active transcription session using the `update_speakers()` method. + +```python +from livekit.plugins import speechmatics +from livekit.plugins.speechmatics import SpeakerFocusMode + +stt = speechmatics.STT(enable_diarization=True) + +# Later, during transcription: +stt.update_speakers( + focus_speakers=["S1", "S2"], + ignore_speakers=["S3"], + focus_mode=SpeakerFocusMode.RETAIN, +) +``` + +This is useful when you need to adjust speaker filtering based on runtime conditions, such as when a new participant joins or leaves a conversation. 
+ +#### Example + + + {sttFullExample} + + +## Next steps + +- [Quickstart](/integrations-and-sdks/livekit) — Build a complete voice agent +- [Text to speech](/integrations-and-sdks/livekit/tts) — Use Speechmatics voices in your agent diff --git a/docs/integrations-and-sdks/pipecat/tts.mdx b/docs/integrations-and-sdks/pipecat/tts.mdx new file mode 100644 index 00000000..ebd83031 --- /dev/null +++ b/docs/integrations-and-sdks/pipecat/tts.mdx @@ -0,0 +1,41 @@ +--- +description: Use Speechmatics text-to-speech voices in your LiveKit voice agents. +--- + +# LiveKit text to speech + +Give your LiveKit voice agent natural, expressive speech with Speechmatics TTS. + +## Installation + +```bash +uv add "livekit-agents[speechmatics]~=1.4" +``` + +## Usage + +```python +from livekit.agents import AgentSession +from livekit.plugins import speechmatics + +session = AgentSession( + tts=speechmatics.TTS(), + # ... stt, llm, etc. +) +``` + +## Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `voice` | string | `"sarah"` | Voice model to use | +| `api_key` | string | env var | Speechmatics API key (defaults to `SPEECHMATICS_API_KEY`) | + +For available voices and detailed TTS options, see the [Text to speech quickstart](/text-to-speech/quickstart). 
+ + + +## Next steps + +- [Quickstart](/integrations-and-sdks/livekit) — Build a complete voice agent +- [Speech to text](/integrations-and-sdks/livekit/stt) — Configure STT options diff --git a/docs/integrations-and-sdks/sidebar.ts b/docs/integrations-and-sdks/sidebar.ts index 337d99a5..21475c32 100644 --- a/docs/integrations-and-sdks/sidebar.ts +++ b/docs/integrations-and-sdks/sidebar.ts @@ -1,3 +1,6 @@ +import livekitSidebar from "./livekit/sidebar"; +import pipecatSidebar from "./pipecat/sidebar"; + export default { type: "category", label: "Integrations and SDKs", @@ -14,20 +17,12 @@ export default { id: "integrations-and-sdks/vapi", label: "Vapi", }, + livekitSidebar, + pipecatSidebar, { type: "doc", - id: "integrations-and-sdks/livekit", - label: "LiveKit", + id: "integrations-and-sdks/sdks", + label: "SDKs", }, - { - type: "doc", - id: "integrations-and-sdks/pipecat", - label: "Pipecat", - }, - { - type: "doc", - id: "integrations-and-sdks/sdks", - label: "SDKs", - }, -] -} \ No newline at end of file + ], +} as const; \ No newline at end of file From f8eb907c35bdff9d0761f9e9823bfa885e46f83a Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Wed, 25 Feb 2026 11:43:28 +0000 Subject: [PATCH 2/4] docs: pipecat boilerplate baseline --- docs/integrations-and-sdks/index.mdx | 2 +- docs/integrations-and-sdks/pipecat.mdx | 72 -------------------------- docs/integrations-and-sdks/sidebar.ts | 7 ++- 3 files changed, 6 insertions(+), 75 deletions(-) delete mode 100644 docs/integrations-and-sdks/pipecat.mdx diff --git a/docs/integrations-and-sdks/index.mdx b/docs/integrations-and-sdks/index.mdx index 6f3e97be..6096a57c 100644 --- a/docs/integrations-and-sdks/index.mdx +++ b/docs/integrations-and-sdks/index.mdx @@ -39,7 +39,7 @@ Choose an integration to build accurate, low-latency voice agents rapidly with t title="Pipecat" description="Open-source framework with full control of the voice pipeline in code. Complex agents. Best for: power builders." 
icon={Pipecat logo} - href="/integrations-and-sdks/pipecat" + href="/integrations-and-sdks/pipecat/" /> diff --git a/docs/integrations-and-sdks/pipecat.mdx b/docs/integrations-and-sdks/pipecat.mdx deleted file mode 100644 index bf9935cd..00000000 --- a/docs/integrations-and-sdks/pipecat.mdx +++ /dev/null @@ -1,72 +0,0 @@ ---- -id: pipecat -description: Learn how to integrate Speechmatics STT with Pipecat. ---- - -import CodeBlock from '@theme/CodeBlock'; - -# Pipecat integration - -Pipecat is an open-source framework for building voice agents. When Speechmatics STT is integrated with Pipecat, you can build real-time voice and multimodal conversational agent specifically tailored to your needs. - -Pipecat is perfect for: - -- **Voice AI**: Voice assistants, chatbots, and IVR systems -- **Transcription**: Realtime transcription of live events or media -- **Accessibility applications**: Screen readers and assistive technologies -- **Content creation**: Podcasts, dubbing, audiobooks, and voice-overs -- **Media production**: News broadcasts and automated announcements - -## Features - -- **Realtime transcription**: low-latency speech-to-text for responsive agents -- **Speaker diarization**: track who’s speaking in multi-participant sessions -- **Turn detection**: capture natural speech boundaries automatically -- **Noise robustness**: maintain accuracy in challenging environments -- **Custom vocabularies**: boost recognition for domain-specific terms -- **Flexible deployment**: use on-device, cloud, or hybrid Pipecat setups -## Quickstart - -### Requirements -- Python 3.10 or later -- uv package manager installed -- Pipecat >= 1.2 -- Speechmatics account. You can create one [here](https://portal.speechmatics.com). -- Speechmatics API key. You can generate one in the [Portal](https://portal.speechmatics.com/settings/api-keys). 
- -#### Installation -```python -pip install "pipecat-ai[speechmatics]" -``` -### Usage -Set the environment variable SPEECHMATICS_API_KEY to your Speechmatics API key. -```bash -export SPEECHMATICS_API_KEY=your_api_key -``` - -```python -import asyncio -import os - -from pipecat.services.speechmatics import SpeechmaticsSTTService - - -async def main(): - stt = SpeechmaticsSTTService( - api_key=os.environ["SPEECHMATICS_API_KEY"], - ) - - async def audio_stream(): - # Replace with your real audio source. - yield from [b"fake_audio_chunk_1", b"fake_audio_chunk_2"] - - async for result in stt.transcribe(audio_stream()): - speaker = f"Speaker {result.speaker}" if result.speaker else "Unknown" - print(f"{speaker}: {result.text}") - - -if __name__ == "__main__": - asyncio.run(main()) -``` - -For detailed examples, please see the [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy). diff --git a/docs/integrations-and-sdks/sidebar.ts b/docs/integrations-and-sdks/sidebar.ts index 21475c32..bc303d6e 100644 --- a/docs/integrations-and-sdks/sidebar.ts +++ b/docs/integrations-and-sdks/sidebar.ts @@ -1,4 +1,3 @@ -import livekitSidebar from "./livekit/sidebar"; import pipecatSidebar from "./pipecat/sidebar"; export default { @@ -17,7 +16,11 @@ export default { id: "integrations-and-sdks/vapi", label: "Vapi", }, - livekitSidebar, + { + type: "doc", + id: "integrations-and-sdks/livekit", + label: "LiveKit", + }, pipecatSidebar, { type: "doc", From c0ac551bbd539c6a3aa9204f44fe934a27ebd0a5 Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Wed, 25 Feb 2026 15:59:45 +0000 Subject: [PATCH 3/4] docs: uplift pipecat quickstart --- .../pipecat/assets/main.py | 148 +++++++++++++----- docs/integrations-and-sdks/pipecat/index.mdx | 100 ++++-------- 2 files changed, 142 insertions(+), 106 deletions(-) diff --git a/docs/integrations-and-sdks/pipecat/assets/main.py b/docs/integrations-and-sdks/pipecat/assets/main.py index 9ea2c802..3d837e90 100644 --- 
a/docs/integrations-and-sdks/pipecat/assets/main.py +++ b/docs/integrations-and-sdks/pipecat/assets/main.py @@ -1,56 +1,126 @@ +import os + +import aiohttp from dotenv import load_dotenv -from livekit import agents -from livekit.agents import AgentSession, Agent, RoomInputOptions -from livekit.plugins import openai, silero, speechmatics -from livekit.plugins.speechmatics import TurnDetectionMode +from loguru import logger + +from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import LLMRunFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_context import LLMContext +from pipecat.processors.aggregators.llm_response_universal import ( + LLMContextAggregatorPair, + LLMUserAggregatorParams, +) +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService +from pipecat.services.speechmatics.tts import SpeechmaticsTTSService +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.turns.user_stop.turn_analyzer_user_turn_stop_strategy import ( + TurnAnalyzerUserTurnStopStrategy, +) +from pipecat.turns.user_turn_strategies import UserTurnStrategies + +load_dotenv(override=True) -load_dotenv(".env.local") +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info("Starting bot") + + async with aiohttp.ClientSession() as session: + stt = SpeechmaticsSTTService( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + params=SpeechmaticsSTTService.InputParams( + turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.EXTERNAL, 
+ ), + ) + + llm = OpenAILLMService( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o-mini", + ) -class VoiceAssistant(Agent): - def __init__(self): - super().__init__( - instructions="You are a helpful voice assistant. Be concise and friendly." + tts = SpeechmaticsTTSService( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + voice_id="sarah", + aiohttp_session=session, ) + messages = [ + { + "role": "system", + "content": "You are a helpful voice assistant. Be concise and friendly.", + }, + ] + + context = LLMContext(messages) + user_aggregator, assistant_aggregator = LLMContextAggregatorPair( + context, + user_params=LLMUserAggregatorParams( + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + user_turn_strategies=UserTurnStrategies( + stop=[ + TurnAnalyzerUserTurnStopStrategy( + turn_analyzer=LocalSmartTurnAnalyzerV3() + ) + ] + ), + ), + ) -async def entrypoint(ctx: agents.JobContext): - await ctx.connect() + pipeline = Pipeline( + [ + transport.input(), + stt, + user_aggregator, + llm, + tts, + transport.output(), + assistant_aggregator, + ] + ) - # Speech-to-Text: Speechmatics - stt = speechmatics.STT( - turn_detection_mode=TurnDetectionMode.SMART_TURN, - ) + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + ) - # Language Model: OpenAI - llm = openai.LLM(model="gpt-4o-mini") + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info("Client connected") + await task.queue_frames([LLMRunFrame()]) - # Text-to-Speech: Speechmatics - tts = speechmatics.TTS() + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info("Client disconnected") + await task.cancel() - # Voice Activity Detection: Silero - vad = silero.VAD.load() + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + await runner.run(task) - # Create and start session - session = 
AgentSession( - stt=stt, - llm=llm, - tts=tts, - vad=vad, - ) - await session.start( - room=ctx.room, - agent=VoiceAssistant(), - room_input_options=RoomInputOptions(), - ) +async def bot(runner_args: RunnerArguments): + transport_params = { + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + ), + } - await session.generate_reply( - instructions="Say a short hello and ask how you can help." - ) + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) if __name__ == "__main__": - agents.cli.run_app( - agents.WorkerOptions(entrypoint_fnc=entrypoint), - ) + from pipecat.runner.run import main + + main() diff --git a/docs/integrations-and-sdks/pipecat/index.mdx b/docs/integrations-and-sdks/pipecat/index.mdx index 510df978..3d658074 100644 --- a/docs/integrations-and-sdks/pipecat/index.mdx +++ b/docs/integrations-and-sdks/pipecat/index.mdx @@ -1,36 +1,31 @@ --- -description: Build a voice AI agent with Speechmatics STT and TTS using LiveKit Agents. + description: Build a local voice bot with Speechmatics STT and TTS using Pipecat. --- import CodeBlock from '@theme/CodeBlock' -import livekitQuickstartMainPy from "./assets/main.py?raw" +import pipecatQuickstartMainPy from "./assets/main.py?raw" -# LiveKit quickstart +# Pipecat quickstart -Build a real-time voice AI agent with Speechmatics and LiveKit in minutes. +Build a local voice bot with Speechmatics and Pipecat in minutes. -[LiveKit Agents](https://docs.livekit.io/agents/) is a framework for building voice AI applications using WebRTC. With the Speechmatics plugin, you get accurate speech recognition and natural text-to-speech for your voice agents. +[Pipecat](https://docs.pipecat.ai/) is a framework for building real-time voice bots using a pipeline architecture. In this quickstart, you’ll run a local WebRTC server and connect to your bot from your browser. 
## Features - **Real-time transcription** — Low-latency speech-to-text as users speak -- **Speaker diarization** — Identify and track multiple speakers -- **Smart turn detection** — Know when the user has finished speaking -- **Natural TTS voices** — Choose from multiple voice options -- **Noise robustness** — Accurate recognition in challenging audio environments -- **Global language support** — Works with diverse accents and dialects +- **Natural text to speech** — Give your bot a clear, natural voice +- **Local web client** — Test your bot in a browser at `http://localhost:7860/client` +- **No infrastructure** — No cloud deployment or room setup required ## Prerequisites - Python 3.10+ - [Speechmatics API key](https://portal.speechmatics.com) -- [LiveKit Cloud account](https://cloud.livekit.io) (free tier available) - [OpenAI API key](https://platform.openai.com) (for the LLM) ## Setup -This guide assumes LiveKit Cloud. If you want to self-host LiveKit instead, follow LiveKit's self-hosting guide and configure `LIVEKIT_URL`, `LIVEKIT_API_KEY`, and `LIVEKIT_API_SECRET` for your deployment: https://docs.livekit.io/transport/self-hosting/ - ### 1. Create project ```bash @@ -39,84 +34,55 @@ mkdir voice-agent && cd voice-agent ### 2. Install dependencies -```bash -uv init -uv add "livekit-agents[speechmatics,openai,silero]==1.4.2" python-dotenv -``` - -### 3. Install and authenticate the LiveKit CLI - -Install the LiveKit CLI. 
For additional installation options, see the LiveKit CLI setup guide: https://docs.livekit.io/home/cli/cli-setup/ - -**macOS**: - -```text -brew install livekit-cli -``` - -**Linux**: - -```text -curl -sSL https://get.livekit.io/cli | bash -``` - -**Windows**: +Create a `requirements.txt` file: -```text -winget install LiveKit.LiveKitCLI +```text title="requirements.txt" +pipecat-ai[local-smart-turn-v3,silero,speechmatics,webrtc,openai,runner] +pipecat-ai-small-webrtc-prebuilt +python-dotenv +loguru ``` -Authenticate and link your LiveKit Cloud project: +Install with [uv](https://docs.astral.sh/uv/): ```bash -lk cloud auth +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt ``` -### 4. Configure environment - -Run the LiveKit CLI to write your LiveKit Cloud credentials to a `.env.local` file: - -```bash -lk app env -w -``` +### 3. Configure environment -This creates a `.env.local` file with your LiveKit credentials. Add your Speechmatics and OpenAI keys: +Create a `.env` file: -```bash title=".env.local" -LIVEKIT_URL=wss://your-project.livekit.cloud -LIVEKIT_API_KEY=... -LIVEKIT_API_SECRET=... +```text title=".env" SPEECHMATICS_API_KEY=your_speechmatics_key OPENAI_API_KEY=your_openai_key ``` -### 5. Create your agent +### 4. Create your bot Create a `main.py` file: - {livekitQuickstartMainPy} + {pipecatQuickstartMainPy} -### 6. Run your agent - -Run your agent in `dev` mode to connect it to LiveKit and make it available from anywhere on the internet: +### 5. Run your bot ```bash -python main.py dev +python main.py ``` -Open the [LiveKit Agents Playground](https://agents-playground.livekit.io) to test your agent. - -Run your agent in `console` mode to speak to it locally in your terminal: +Open `http://localhost:7860/client` in your browser and allow microphone access. -```bash -python main.py console -``` +:::note +The first run can take a little longer while dependencies and models load. 
+::: ## Next steps -- [Speech to text](/integrations-and-sdks/livekit/stt) — Configure diarization, turn detection, and more -- [Text to speech](/integrations-and-sdks/livekit/tts) — Choose voices and adjust settings -- [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy/tree/main/integrations/livekit) — Full working examples -- [LiveKit deployment](https://docs.livekit.io/agents/deployment/) — Deploy to production +- [Speech to text](/integrations-and-sdks/pipecat/stt) — Configure diarization, turn detection, and more +- [Text to speech](/integrations-and-sdks/pipecat/tts) — Choose voices and adjust settings +- [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy/tree/main/integrations/pipecat) — Full working examples +- [Pipecat quickstart](https://docs.pipecat.ai/getting-started/quickstart) — Learn more patterns and deployment options From c2f6feff27eda5f90ae4e163ae0f48c74abc6d59 Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Wed, 25 Feb 2026 20:29:57 +0000 Subject: [PATCH 4/4] create restructured pipecat documentation --- .../pipecat/assets/stt-full-example.py | 69 +++--- docs/integrations-and-sdks/pipecat/stt.mdx | 214 +++++++++--------- docs/integrations-and-sdks/pipecat/tts.mdx | 32 +-- 3 files changed, 158 insertions(+), 157 deletions(-) diff --git a/docs/integrations-and-sdks/pipecat/assets/stt-full-example.py b/docs/integrations-and-sdks/pipecat/assets/stt-full-example.py index 333b09f6..58039be7 100644 --- a/docs/integrations-and-sdks/pipecat/assets/stt-full-example.py +++ b/docs/integrations-and-sdks/pipecat/assets/stt-full-example.py @@ -1,48 +1,35 @@ -from livekit.agents import AgentSession -from livekit.plugins import speechmatics -from livekit.plugins.speechmatics import ( - AdditionalVocabEntry, - AudioEncoding, - OperatingPoint, - SpeakerFocusMode, - SpeakerIdentifier, - TurnDetectionMode, -) - -stt = speechmatics.STT( - # Service options - language="en", - output_locale="en-US", - 
operating_point=OperatingPoint.ENHANCED, +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService - # Turn detection - turn_detection_mode=TurnDetectionMode.ADAPTIVE, - max_delay=1.5, - include_partials=True, +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + # Service options + language="en", + operating_point=SpeechmaticsSTTService.OperatingPoint.ENHANCED, - # Diarization - enable_diarization=True, - speaker_sensitivity=0.6, - max_speakers=4, - prefer_current_speaker=True, + # Turn detection + turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.EXTERNAL, + max_delay=1.5, + include_partials=True, - # Speaker focus - focus_speakers=["S1", "S2"], - focus_mode=SpeakerFocusMode.RETAIN, - ignore_speakers=["__ASSISTANT__"], + # Diarization + enable_diarization=True, + speaker_sensitivity=0.6, + max_speakers=4, + prefer_current_speaker=True, - # Output formatting - speaker_active_format="[{speaker_id}]: {text}", - speaker_passive_format="[{speaker_id} (background)]: {text}", + # Speaker focus + focus_speakers=["S1", "S2"], + focus_mode=SpeechmaticsSTTService.SpeakerFocusMode.RETAIN, + ignore_speakers=[], - # Custom vocabulary - additional_vocab=[ - AdditionalVocabEntry(content="Speechmatics"), - AdditionalVocabEntry(content="LiveKit", sounds_like=["live kit", "livekit"]), - ], -) + # Output formatting + speaker_active_format="[{speaker_id}]: {text}", + speaker_passive_format="[{speaker_id} (background)]: {text}", -session = AgentSession( - stt=stt, - # ... llm, tts, vad, etc. 
+ # Custom vocabulary + additional_vocab=[ + SpeechmaticsSTTService.AdditionalVocabEntry(content="Speechmatics"), + SpeechmaticsSTTService.AdditionalVocabEntry(content="Pipecat", sounds_like=["pipe cat"]), + ], + ), ) diff --git a/docs/integrations-and-sdks/pipecat/stt.mdx b/docs/integrations-and-sdks/pipecat/stt.mdx index ed2b9660..0edcdda9 100644 --- a/docs/integrations-and-sdks/pipecat/stt.mdx +++ b/docs/integrations-and-sdks/pipecat/stt.mdx @@ -1,13 +1,13 @@ --- -description: Transcribe live audio in your LiveKit voice agents with Speechmatics STT. +description: Transcribe live audio in your Pipecat voice bots with Speechmatics STT. --- import CodeBlock from '@theme/CodeBlock' import sttFullExample from "./assets/stt-full-example.py?raw" -# LiveKit speech to text +# Pipecat speech to text -Use the Speechmatics STT plugin to transcribe live audio in your LiveKit voice agents. +Use the Speechmatics STT service to transcribe live audio in your Pipecat voice bots. ## Features @@ -21,41 +21,47 @@ Use the Speechmatics STT plugin to transcribe live audio in your LiveKit voice a ## Installation ```bash -uv add "livekit-agents[speechmatics]~=1.4" +pip install "pipecat-ai[speechmatics]" ``` ## Basic configuration ### Authentication -By default, the plugin reads your API key from `SPEECHMATICS_API_KEY`. +By default, the service reads your API key from the `SPEECHMATICS_API_KEY` environment variable. 
### Service options | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `language` | string | `"en"` | Language code for transcription | -| `output_locale` | string \| null | `null` | Output locale (for example `"en-GB"`) | +| `api_key` | string | env var | Speechmatics API key (defaults to `SPEECHMATICS_API_KEY`) | +| `base_url` | string | env var | Realtime base URL (defaults to `SPEECHMATICS_RT_URL`, or `wss://eu2.rt.speechmatics.com/v2`) | +| `sample_rate` | number | pipeline default | Audio sample rate in Hz | +| `should_interrupt` | boolean | `true` | Enable interruption on detected speech | + +### Input parameters + +These are passed via `params=SpeechmaticsSTTService.InputParams(...)`: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `language` | Language \| string | `Language.EN` | Language code for transcription | | `domain` | string \| null | `null` | Domain-specific model (for example `"finance"`) | | `operating_point` | OperatingPoint \| null | `null` | Transcription accuracy. Use `OperatingPoint.ENHANCED` (higher accuracy) or `OperatingPoint.STANDARD` (lower latency) | -| `base_url` | string | env var | Realtime base URL (defaults to `SPEECHMATICS_RT_URL`, or `wss://eu2.rt.speechmatics.com/v2`) | -| `api_key` | string | env var | Speechmatics API key (defaults to `SPEECHMATICS_API_KEY`) | -| `sample_rate` | number | `16000` | Audio sample rate in Hz. 
Valid values: `8000` or `16000` | | `audio_encoding` | AudioEncoding | `PCM_S16LE` | Audio encoding format: `AudioEncoding.PCM_S16LE`, `AudioEncoding.PCM_F32LE`, or `AudioEncoding.MULAW` | | `punctuation_overrides` | object \| null | `null` | Custom punctuation rules | +| `extra_params` | object \| null | `null` | Additional parameters to pass to the API | #### Example ```python -from livekit.agents import AgentSession -from livekit.plugins import speechmatics +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService -session = AgentSession( - stt=speechmatics.STT( +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( language="en", - output_locale="en-GB", + operating_point=SpeechmaticsSTTService.OperatingPoint.ENHANCED, ), - # ... llm, tts, vad, etc. ) ``` @@ -63,8 +69,7 @@ session = AgentSession( ### Turn detection -The Speechmatics STT plugin uses the Speechmatics Voice SDK for endpointing and turn detection. -Turn detection determines when a user has finished their complete thought, while the Realtime API's `EndOfUtterance` message indicates a pause in speech. The plugin handles this distinction automatically. +Turn detection determines when a user has finished their complete thought, while the Realtime API's `EndOfUtterance` message indicates a pause in speech. The service handles this distinction automatically. #### Modes @@ -72,55 +77,52 @@ Set `turn_detection_mode` to control how end of speech is detected: | Mode | When to use | |------|-------------| -| `TurnDetectionMode.ADAPTIVE` | Default. Adjusts silence threshold based on speech rate, pauses, and disfluencies. Requires `speechmatics-voice[smart]` | +| `TurnDetectionMode.EXTERNAL` | Default and recommended. Delegates turn detection to Pipecat's pipeline (VAD, Smart Turn, etc.). 
Try this first | +| `TurnDetectionMode.ADAPTIVE` | Speechmatics analyzes speech content and acoustic patterns for end-of-turn detection | | `TurnDetectionMode.FIXED` | Fixed silence threshold using `end_of_utterance_silence_trigger` | -| `TurnDetectionMode.SMART_TURN` | ML-based endpointing using acoustic cues for more natural turn-taking. Requires `speechmatics-voice[smart]` | -| `TurnDetectionMode.EXTERNAL` | You control turn boundaries manually (for example using your own VAD and calling `finalize()`) | +| `TurnDetectionMode.SMART_TURN` | Speechmatics Smart Turn for ML-based turn detection | + +:::tip +Start with `EXTERNAL` mode. This lets you use Pipecat's turn detection features (like `LocalSmartTurnAnalyzerV3`) which are well-integrated with the pipeline. Only switch to other modes if you need Speechmatics to handle turn detection directly. +::: ```python -from livekit.plugins import speechmatics -from livekit.plugins.speechmatics import TurnDetectionMode +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService -# Adaptive mode (default) - adjusts to speech patterns -# Requires: pip install speechmatics-voice[smart] -stt = speechmatics.STT( - turn_detection_mode=TurnDetectionMode.ADAPTIVE, +# External mode (default, recommended) - use Pipecat's turn detection +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.EXTERNAL, + ), ) -# Fixed mode - consistent silence threshold -stt = speechmatics.STT( - turn_detection_mode=TurnDetectionMode.FIXED, - end_of_utterance_silence_trigger=0.8, # 800ms of silence - end_of_utterance_max_delay=5.0, # Force end after 5s +# Adaptive mode - Speechmatics determines end-of-turn +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE, + ), ) -# Smart turn mode - ML-based natural turn-taking -# Requires: pip install speechmatics-voice[smart] 
-stt = speechmatics.STT( - turn_detection_mode=TurnDetectionMode.SMART_TURN, +# Fixed mode - consistent silence threshold +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.FIXED, + end_of_utterance_silence_trigger=0.8, # 800ms of silence + end_of_utterance_max_delay=5.0, # Force end after 5s + ), ) -# External mode - manual control via finalize() -stt = speechmatics.STT( - turn_detection_mode=TurnDetectionMode.EXTERNAL, +# Smart turn mode - Speechmatics ML-based turn detection +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.SMART_TURN, + ), ) ``` -#### Manual turn finalization - -When using `TurnDetectionMode.EXTERNAL`, you control when a turn ends by calling `finalize()` on the STT instance. This is useful when you have your own VAD or want to integrate with external signals. - -```python -from livekit.plugins import speechmatics -from livekit.plugins.speechmatics import TurnDetectionMode - -stt = speechmatics.STT( - turn_detection_mode=TurnDetectionMode.EXTERNAL, -) - -# Later, when you detect the user has finished speaking: -stt.finalize() -``` +:::note +When using `ADAPTIVE` or `SMART_TURN` modes, remove any competing VAD or turn-detection features from your pipeline to avoid conflicts. +::: #### Configuration @@ -129,11 +131,12 @@ stt.finalize() | `end_of_utterance_silence_trigger` | number \| null | `null` | Silence duration (seconds) that triggers end of utterance. Used primarily in `FIXED` mode. Valid range: >0 to <2 seconds (exclusive) | | `end_of_utterance_max_delay` | number \| null | `null` | Maximum delay (seconds) before forcing an end of utterance. Must be greater than `end_of_utterance_silence_trigger` | | `max_delay` | number \| null | `null` | Maximum transcription delay (seconds). Lower values reduce latency at the cost of accuracy. 
Valid range: 0.7–4.0 seconds | -| `include_partials` | boolean \| null | `null` | Enable partial (interim) transcription results. When `null`, defaults to `true` | +| `include_partials` | boolean \| null | `null` | Enable partial (interim) transcription results | +| `split_sentences` | boolean \| null | `null` | Split transcription into sentences | ### Advanced diarization -The plugin can attribute words to speakers and lets you decide which speakers are treated as **active** (foreground) vs **passive** (background). +The service can attribute words to speakers and lets you decide which speakers are treated as **active** (foreground) vs **passive** (background). #### Configuration @@ -147,18 +150,19 @@ The plugin can attribute words to speakers and lets you decide which speakers ar | `additional_vocab` | array \| null | `null` | Custom vocabulary entries (`AdditionalVocabEntry` objects) for improved recognition | ```python -from livekit.plugins import speechmatics -from livekit.plugins.speechmatics import AdditionalVocabEntry - -stt = speechmatics.STT( - enable_diarization=True, - speaker_sensitivity=0.7, - max_speakers=3, - prefer_current_speaker=True, - additional_vocab=[ - AdditionalVocabEntry(content="Speechmatics"), - AdditionalVocabEntry(content="API", sounds_like=["A P I"]), - ], +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService + +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + enable_diarization=True, + speaker_sensitivity=0.7, + max_speakers=3, + prefer_current_speaker=True, + additional_vocab=[ + SpeechmaticsSTTService.AdditionalVocabEntry(content="Speechmatics"), + SpeechmaticsSTTService.AdditionalVocabEntry(content="API", sounds_like=["A P I"]), + ], + ), ) ``` @@ -167,15 +171,16 @@ stt = speechmatics.STT( Use `known_speakers` to attribute words to specific speakers across sessions. This is useful when you want consistent speaker identification for known participants. 
```python -from livekit.plugins import speechmatics -from livekit.plugins.speechmatics import SpeakerIdentifier - -stt = speechmatics.STT( - enable_diarization=True, - known_speakers=[ - SpeakerIdentifier(label="Alice", speaker_identifiers=["speaker_abc123"]), - SpeakerIdentifier(label="Bob", speaker_identifiers=["speaker_def456"]), - ], +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService + +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + enable_diarization=True, + known_speakers=[ + SpeechmaticsSTTService.SpeakerIdentifier(label="Alice", speaker_identifiers=["speaker_abc123"]), + SpeechmaticsSTTService.SpeakerIdentifier(label="Bob", speaker_identifiers=["speaker_def456"]), + ], + ), ) ``` @@ -201,25 +206,22 @@ Control which speakers are treated as **active** (foreground) vs **passive** (ba `ignore_speakers` always excludes those speakers from transcription and their speech will not trigger VAD or end of utterance detection. -:::tip -By default, any speaker label wrapped in double underscores (for example `__ASSISTANT__`) is automatically excluded. This convention lets you filter out assistant audio without explicitly adding it to `ignore_speakers`. -::: - ```python -from livekit.plugins import speechmatics -from livekit.plugins.speechmatics import SpeakerFocusMode +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService -stt = speechmatics.STT( - focus_speakers=["S1"], - focus_mode=SpeakerFocusMode.RETAIN, - ignore_speakers=["S3"], +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + focus_speakers=["S1"], + focus_mode=SpeechmaticsSTTService.SpeakerFocusMode.RETAIN, + ignore_speakers=["S3"], + ), ) ``` #### Speaker formatting Use `speaker_active_format` and `speaker_passive_format` to format transcripts for your LLM. -The templates support `{speaker_id}` and `{text}`. +The templates support `{speaker_id}`, `{text}`, `{ts}`, `{start_time}`, `{end_time}`, and `{lang}`. 
| Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -227,31 +229,36 @@ The templates support `{speaker_id}` and `{text}`. | `speaker_passive_format` | string \| null | `null` | Format template for passive speaker output | ```python -from livekit.plugins import speechmatics +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService -stt = speechmatics.STT( - speaker_active_format="<{speaker_id}>{text}", - speaker_passive_format="<{speaker_id} background>{text}", +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams( + speaker_active_format="<{speaker_id}>{text}", + speaker_passive_format="<{speaker_id} background>{text}", + ), ) ``` -When you use a custom format, include it in your agent instructions so the LLM can interpret speaker tags consistently. +When you use a custom format, include it in your bot's system prompt so the LLM can interpret speaker tags consistently. #### Updating speakers during transcription -You can dynamically change which speakers to focus on or ignore during an active transcription session using the `update_speakers()` method. +You can dynamically change which speakers to focus on or ignore during an active transcription session using the `update_params()` method. 
```python -from livekit.plugins import speechmatics -from livekit.plugins.speechmatics import SpeakerFocusMode +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService -stt = speechmatics.STT(enable_diarization=True) +stt = SpeechmaticsSTTService( + params=SpeechmaticsSTTService.InputParams(enable_diarization=True), +) # Later, during transcription: -stt.update_speakers( - focus_speakers=["S1", "S2"], - ignore_speakers=["S3"], - focus_mode=SpeakerFocusMode.RETAIN, +stt.update_params( + SpeechmaticsSTTService.UpdateParams( + focus_speakers=["S1", "S2"], + ignore_speakers=["S3"], + focus_mode=SpeechmaticsSTTService.SpeakerFocusMode.RETAIN, + ) ) ``` @@ -265,5 +272,6 @@ This is useful when you need to adjust speaker filtering based on runtime condit ## Next steps -- [Quickstart](/integrations-and-sdks/livekit) — Build a complete voice agent -- [Text to speech](/integrations-and-sdks/livekit/tts) — Use Speechmatics voices in your agent +- [Quickstart](/integrations-and-sdks/pipecat) — Build a complete voice bot +- [Text to speech](/integrations-and-sdks/pipecat/tts) — Use Speechmatics voices in your bot +- [Pipecat documentation](https://docs.pipecat.ai/server/services/stt/speechmatics) — Full Speechmatics STT reference diff --git a/docs/integrations-and-sdks/pipecat/tts.mdx b/docs/integrations-and-sdks/pipecat/tts.mdx index ebd83031..dde396d7 100644 --- a/docs/integrations-and-sdks/pipecat/tts.mdx +++ b/docs/integrations-and-sdks/pipecat/tts.mdx @@ -1,35 +1,40 @@ --- -description: Use Speechmatics text-to-speech voices in your LiveKit voice agents. +description: Use Speechmatics text to speech voices in your Pipecat voice bots. --- -# LiveKit text to speech +# Pipecat text to speech -Give your LiveKit voice agent natural, expressive speech with Speechmatics TTS. +Use Speechmatics TTS to give your Pipecat voice bot a clear, natural voice. 
## Installation ```bash -uv add "livekit-agents[speechmatics]~=1.4" +pip install "pipecat-ai[speechmatics]" ``` ## Usage ```python -from livekit.agents import AgentSession -from livekit.plugins import speechmatics +import aiohttp -session = AgentSession( - tts=speechmatics.TTS(), - # ... stt, llm, etc. -) +from pipecat.services.speechmatics.tts import SpeechmaticsTTSService + +async with aiohttp.ClientSession() as session: + tts = SpeechmaticsTTSService( + api_key="YOUR_API_KEY", + voice_id="sarah", + aiohttp_session=session, + ) ``` ## Configuration | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `voice` | string | `"sarah"` | Voice model to use | | `api_key` | string | env var | Speechmatics API key (defaults to `SPEECHMATICS_API_KEY`) | +| `voice_id` | string | `"sarah"` | Voice to use | +| `base_url` | string | service default | Base URL for the Speechmatics TTS endpoint | +| `aiohttp_session` | `aiohttp.ClientSession` | none | Reuse a session for connection pooling and lower latency | For available voices and detailed TTS options, see the [Text to speech quickstart](/text-to-speech/quickstart). @@ -37,5 +42,6 @@ For available voices and detailed TTS options, see the [Text to speech quickstar ## Next steps -- [Quickstart](/integrations-and-sdks/livekit) — Build a complete voice agent -- [Speech to text](/integrations-and-sdks/livekit/stt) — Configure STT options +- [Quickstart](/integrations-and-sdks/pipecat) — Build a complete voice bot +- [Speech to text](/integrations-and-sdks/pipecat/stt) — Configure STT options +- [Pipecat documentation](https://docs.pipecat.ai/server/services/tts/speechmatics) — Full Speechmatics TTS reference