From 11c28b7e935e71cf30e8092c305528efd5287bac Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Fri, 20 Feb 2026 16:36:22 +0000 Subject: [PATCH 1/3] Voice agents: hide Flow, add integrations-first overview, add Voice SDK page --- docs/voice-agents/overview.mdx | 195 ++++-------------- docs/voice-agents/sidebar.ts | 7 +- .../{features.mdx => voice-sdk.mdx} | 161 ++++++++++++++- sidebars.ts | 2 +- 4 files changed, 202 insertions(+), 163 deletions(-) rename docs/voice-agents/{features.mdx => voice-sdk.mdx} (52%) diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index 55f60855..80044889 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -1,170 +1,57 @@ --- -description: Learn how to build voice-enabled applications with the Speechmatics Voice SDK +description: Learn how to build voice agents with Speechmatics integrations and the Voice SDK. --- -import Admonition from '@theme/Admonition'; -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import { LinkCard } from "@site/src/theme/LinkCard"; +import { Grid } from "@radix-ui/themes"; -import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" -import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw" -import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw" +# Voice agents overview -# Voice SDK overview -The Voice SDK builds on our Realtime API to provide additional features optimized for conversational AI, using Python: +Our integration partners can be the quickest way to get a production voice agent up and running. -- **Intelligent segmentation**: groups words into meaningful speech segments per speaker. -- **Turn detection**: automatically detects when speakers finish talking. -- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. 
-- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. -- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. +If you’re building it yourself, you can also use our Voice SDK. Integrations are built on top of the Voice SDK, which provides features optimized for conversational AI. -### Voice SDK vs Realtime SDK +If you’re building an integration and want to work with us, contact support. -Use the Voice SDK when: +## Features -- Building conversational AI or voice agents -- You need automatic turn detection -- You want speaker-focused transcription -- You need ready-to-use presets for common scenarios +Speechmatics provides building blocks you can use through integrations and the Voice SDK. -Use the Realtime SDK when: +It includes: -- You need the raw stream of word-by-word transcription data -- Building custom segmentation logic -- You want fine-grained control over every event -- Processing audio files or custom workflows +- **Turn detection**: detect when a speaker has finished talking. +- **Intelligent segmentation**: group partial transcripts into clean, speaker-attributed segments. +- **Diarization**: identify and label different speakers. +- **Speaker focus**: focus on or ignore specific speakers in multi-speaker scenarios. +- **Preset configurations**: start quickly with ready-to-use settings. +- **Structured events**: work with clean segments instead of raw word-level events. -## Getting started +## Integrations -### 1. Create an API key +Use an integration to handle audio transport and wiring, so you can focus on your agent logic: -[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. -Store your key securely as a managed secret. + + } + href="/integrations-and-sdks/vapi" + /> + } + href="/integrations-and-sdks/livekit" + /> + } + href="/integrations-and-sdks/pipecat" + /> + -### 2. 
Install dependencies +## Voice SDK -```bash -# Standard installation -pip install speechmatics-voice - -# With SMART_TURN (ML-based turn detection) -pip install speechmatics-voice[smart] -``` - -### 3. Quickstart - -Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID: - -```python -import asyncio -import os -from speechmatics.rt import Microphone -from speechmatics.voice import VoiceAgentClient, AgentServerMessageType - -async def main(): - """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" - - # Audio configuration - SAMPLE_RATE = 16000 # Hz - CHUNK_SIZE = 160 # Samples per read - PRESET = "scribe" # Configuration preset - - # Create client with preset - client = VoiceAgentClient( - api_key=os.getenv("SPEECHMATICS_API_KEY"), - preset=PRESET - ) - - # Print finalised segments of speech with speaker ID - @client.on(AgentServerMessageType.ADD_SEGMENT) - def on_segment(message): - for segment in message["segments"]: - speaker = segment["speaker_id"] - text = segment["text"] - print(f"{speaker}: {text}") - - # Setup microphone - mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) - if not mic.start(): - print("Error: Microphone not available") - return - - # Connect to the Voice Agent - await client.connect() - - # Stream microphone audio (interruptable using keyboard) - try: - while True: - audio_chunk = await mic.read(CHUNK_SIZE) - if not audio_chunk: - break # Microphone stopped producing data - await client.send_audio(audio_chunk) - except KeyboardInterrupt: - pass - finally: - await client.disconnect() - -if __name__ == "__main__": - asyncio.run(main()) - -``` - -#### Presets - the simplest way to get started -These are purpose-built, optimized configurations, ready for use without further modification: - -`fast` - low latency, fast responses - -`adaptive` - general conversation - -`smart_turn` - complex conversation - -`external` - user handles end of turn - -`scribe` - 
note-taking - -`captions` - live captioning - -To view all available presets: -```python -presets = VoiceAgentConfigPreset.list_presets() -``` - -### 4. Custom configurations - -For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays: - - -Specify configurations in a `VoiceAgentConfig` object: - - {pythonVoiceCustomConfig} - - - -Use presets as a starting point and customise with overlays: - - {pythonVoiceConfigOverlays} - - - - -Note: If no configuration or preset is provided, the client will default to the `external` preset. - - - - -## FAQ -### Support - -
-Where can I provide feedback or get help? - -You can submit feedback, bug reports, or feature requests through the Speechmatics [GitHub discussions](https://github.com/orgs/speechmatics/discussions). -
- -## Next steps - -- For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on GitHub. -- For working examples, integrations and templates, check out the [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy). -- Share and discuss your project with [our team](https://support.speechmatics.com) or join our [developer community on Reddit](https://www.reddit.com/r/Speechmatics) to connect with other builders in voice AI. +Use the Voice SDK to handle turn detection, group transcripts into clean segments, and apply diarization for LLM workflows. +See [Voice SDK](/voice-agents/voice-sdk) for getting started, presets, and configuration. diff --git a/docs/voice-agents/sidebar.ts b/docs/voice-agents/sidebar.ts index f14bba42..a622bfff 100644 --- a/docs/voice-agents/sidebar.ts +++ b/docs/voice-agents/sidebar.ts @@ -1,5 +1,3 @@ -import voiceAgentsFlowSidebar from "./flow/sidebar"; - export default { type: "category", label: "Voice agents", @@ -13,9 +11,8 @@ export default { }, { type: "doc", - id: "voice-agents/features", - label: "Features", + id: "voice-agents/voice-sdk", + label: "Voice SDK", }, - voiceAgentsFlowSidebar, ], } as const; \ No newline at end of file diff --git a/docs/voice-agents/features.mdx b/docs/voice-agents/voice-sdk.mdx similarity index 52% rename from docs/voice-agents/features.mdx rename to docs/voice-agents/voice-sdk.mdx index 8232cc91..bcef2065 100644 --- a/docs/voice-agents/features.mdx +++ b/docs/voice-agents/voice-sdk.mdx @@ -1,9 +1,163 @@ --- -description: Learn about configuration parameters for the Voice SDK +description: Learn how to use the Voice SDK. 
--- import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -# Features +import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" +import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw" +import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw" + + + +# Voice SDK + +The Voice SDK is a Python library that provides additional features optimized for conversational AI, built on top of our Realtime API. + +We use it to build our integrations, and it is also available for you to use. + +- **Intelligent segmentation**: groups words into meaningful speech segments per speaker. +- **Turn detection**: automatically detects when speakers finish talking. +- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. +- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. +- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. + +### Voice SDK vs Realtime SDK + +Use the Voice SDK when: + +- Building conversational AI or voice agents +- You need automatic turn detection +- You want speaker-focused transcription +- You need ready-to-use presets for common scenarios + +Use the Realtime SDK when: + +- You need the raw stream of word-by-word transcription data +- Building custom segmentation logic +- You want fine-grained control over every event +- Processing audio files or custom workflows + +## Getting started + +### 1. Create an API key + +[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. +Store your key securely as a managed secret. + +### 2. Install dependencies + +```bash +# Standard installation +pip install speechmatics-voice + +# With SMART_TURN (ML-based turn detection) +pip install speechmatics-voice[smart] +``` + +### 3. 
Quickstart + +Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID: + +```python +import asyncio +import os +from speechmatics.rt import Microphone +from speechmatics.voice import VoiceAgentClient, AgentServerMessageType + +async def main(): + """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" + + # Audio configuration + SAMPLE_RATE = 16000 # Hz + CHUNK_SIZE = 160 # Samples per read + PRESET = "scribe" # Configuration preset + + # Create client with preset + client = VoiceAgentClient( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + preset=PRESET + ) + + # Print finalised segments of speech with speaker ID + @client.on(AgentServerMessageType.ADD_SEGMENT) + def on_segment(message): + for segment in message["segments"]: + speaker = segment["speaker_id"] + text = segment["text"] + print(f"{speaker}: {text}") + + # Setup microphone + mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) + if not mic.start(): + print("Error: Microphone not available") + return + + # Connect to the Voice Agent + await client.connect() + + # Stream microphone audio (interruptable using keyboard) + try: + while True: + audio_chunk = await mic.read(CHUNK_SIZE) + if not audio_chunk: + break # Microphone stopped producing data + await client.send_audio(audio_chunk) + except KeyboardInterrupt: + pass + finally: + await client.disconnect() + +if __name__ == "__main__": + asyncio.run(main()) + +``` + +#### Presets - the simplest way to get started + +These are purpose-built, optimized configurations, ready for use without further modification: + +`fast` - low latency, fast responses + +`adaptive` - general conversation + +`smart_turn` - complex conversation + +`external` - user handles end of turn + +`scribe` - note-taking + +`captions` - live captioning + +To view all available presets: + +```python +presets = VoiceAgentConfigPreset.list_presets() +``` + +### 4. 
Custom configurations + +For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays: + + + +Specify configurations in a `VoiceAgentConfig` object: + + {pythonVoiceCustomConfig} + + + +Use presets as a starting point and customise with overlays: + + {pythonVoiceConfigOverlays} + + + + +Note: If no configuration or preset is provided, the client will default to the `external` preset. + +## Configuration ### Basic parameters `language` (str, default: "en") @@ -45,7 +199,8 @@ Silence duration in seconds to trigger turn end. Maximum delay before forcing turn end. `max_delay` (float, default: 0.7) -Maximum transcription delay for word emission. +Maximum transcription delay for word emission. +Defaults to 0.7 seconds, but when using turn detection we recommend 1.0s for better accuracy. Turn detection will ensure finalisation latency is not affected. ### Speaker configuration `speaker_sensitivity` (float, default: 0.5) diff --git a/sidebars.ts b/sidebars.ts index 61f6511b..2f7522b6 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -10,8 +10,8 @@ export default { docs: [ gettingStartedSidebar, speechToTextSidebar, - voiceAgentsSidebar, textToSpeechSidebar, + voiceAgentsSidebar, integrationsAndSDKSidebar, deploymentsSidebar, { From 7d40dba9796a48fa860f5245157c4fd4a40722a8 Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Fri, 27 Feb 2026 09:58:11 +0000 Subject: [PATCH 2/3] Refined some wording and added link for customer support --- docs/voice-agents/overview.mdx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index 80044889..459f8a7e 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -6,11 +6,8 @@ import { Grid } from "@radix-ui/themes"; # Voice agents overview -Our integration partners can be the quickest way to get a production voice agent up and running. 
-
-If you’re building it yourself, you can also use our Voice SDK. Integrations are built on top of the Voice SDK, which provides features optimized for conversational AI.
-
-If you’re building an integration and want to work with us, contact support.
+Our Voice SDK provides features optimized for conversational AI, which we use to build our integrations.
+Our integration partners are the quickest way to get a production voice agent up and running.
 
 ## Features
 
@@ -55,3 +52,7 @@ Use an integration to handle audio transport and wiring, so you can focus on you
 
 Use the Voice SDK to handle turn detection, group transcripts into clean segments, and apply diarization for LLM workflows.
 See [Voice SDK](/voice-agents/voice-sdk) for getting started, presets, and configuration.
+
+If you’re building an integration and want to work with us, [contact support](https://support.speechmatics.com).
+If you’re building an integration and want to work with us, [contact support](https://support.speechmatics.com).
+

From 8204fca3e401adc15377180960072a6b293f6c9a Mon Sep 17 00:00:00 2001
From: Archie McMullan
Date: Fri, 27 Feb 2026 10:04:36 +0000
Subject: [PATCH 3/3] fixed duplicated support line

---
 docs/voice-agents/overview.mdx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx
index 459f8a7e..cb05b61d 100644
--- a/docs/voice-agents/overview.mdx
+++ b/docs/voice-agents/overview.mdx
@@ -54,5 +54,4 @@ Use the Voice SDK to handle turn detection, group transcripts into clean segment
 See [Voice SDK](/voice-agents/voice-sdk) for getting started, presets, and configuration.
 
 If you’re building an integration and want to work with us, [contact support](https://support.speechmatics.com).
-If you’re building an integration and want to work with us, [contact support](https://support.speechmatics.com).
 