From 11c28b7e935e71cf30e8092c305528efd5287bac Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Fri, 20 Feb 2026 16:36:22 +0000 Subject: [PATCH 1/6] Voice agents: hide Flow, add integrations-first overview, add Voice SDK page --- docs/voice-agents/overview.mdx | 195 ++++-------------- docs/voice-agents/sidebar.ts | 7 +- .../{features.mdx => voice-sdk.mdx} | 161 ++++++++++++++- sidebars.ts | 2 +- 4 files changed, 202 insertions(+), 163 deletions(-) rename docs/voice-agents/{features.mdx => voice-sdk.mdx} (52%) diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index 55f60855..80044889 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -1,170 +1,57 @@ --- -description: Learn how to build voice-enabled applications with the Speechmatics Voice SDK +description: Learn how to build voice agents with Speechmatics integrations and the Voice SDK. --- -import Admonition from '@theme/Admonition'; -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import { LinkCard } from "@site/src/theme/LinkCard"; +import { Grid } from "@radix-ui/themes"; -import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" -import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw" -import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw" +# Voice agents overview -# Voice SDK overview -The Voice SDK builds on our Realtime API to provide additional features optimized for conversational AI, using Python: +Our integration partners can be the quickest way to get a production voice agent up and running. -- **Intelligent segmentation**: groups words into meaningful speech segments per speaker. -- **Turn detection**: automatically detects when speakers finish talking. -- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. 
-- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. -- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. +If you’re building it yourself, you can also use our Voice SDK. Integrations are built on top of the Voice SDK, which provides features optimized for conversational AI. -### Voice SDK vs Realtime SDK +If you’re building an integration and want to work with us, contact support. -Use the Voice SDK when: +## Features -- Building conversational AI or voice agents -- You need automatic turn detection -- You want speaker-focused transcription -- You need ready-to-use presets for common scenarios +Speechmatics provides building blocks you can use through integrations and the Voice SDK. -Use the Realtime SDK when: +It includes: -- You need the raw stream of word-by-word transcription data -- Building custom segmentation logic -- You want fine-grained control over every event -- Processing audio files or custom workflows +- **Turn detection**: detect when a speaker has finished talking. +- **Intelligent segmentation**: group partial transcripts into clean, speaker-attributed segments. +- **Diarization**: identify and label different speakers. +- **Speaker focus**: focus on or ignore specific speakers in multi-speaker scenarios. +- **Preset configurations**: start quickly with ready-to-use settings. +- **Structured events**: work with clean segments instead of raw word-level events. -## Getting started +## Integrations -### 1. Create an API key +Use an integration to handle audio transport and wiring, so you can focus on your agent logic: -[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. -Store your key securely as a managed secret. + + } + href="/integrations-and-sdks/vapi" + /> + } + href="/integrations-and-sdks/livekit" + /> + } + href="/integrations-and-sdks/pipecat" + /> + -### 2. 
Install dependencies +## Voice SDK -```bash -# Standard installation -pip install speechmatics-voice - -# With SMART_TURN (ML-based turn detection) -pip install speechmatics-voice[smart] -``` - -### 3. Quickstart - -Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID: - -```python -import asyncio -import os -from speechmatics.rt import Microphone -from speechmatics.voice import VoiceAgentClient, AgentServerMessageType - -async def main(): - """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" - - # Audio configuration - SAMPLE_RATE = 16000 # Hz - CHUNK_SIZE = 160 # Samples per read - PRESET = "scribe" # Configuration preset - - # Create client with preset - client = VoiceAgentClient( - api_key=os.getenv("SPEECHMATICS_API_KEY"), - preset=PRESET - ) - - # Print finalised segments of speech with speaker ID - @client.on(AgentServerMessageType.ADD_SEGMENT) - def on_segment(message): - for segment in message["segments"]: - speaker = segment["speaker_id"] - text = segment["text"] - print(f"{speaker}: {text}") - - # Setup microphone - mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) - if not mic.start(): - print("Error: Microphone not available") - return - - # Connect to the Voice Agent - await client.connect() - - # Stream microphone audio (interruptable using keyboard) - try: - while True: - audio_chunk = await mic.read(CHUNK_SIZE) - if not audio_chunk: - break # Microphone stopped producing data - await client.send_audio(audio_chunk) - except KeyboardInterrupt: - pass - finally: - await client.disconnect() - -if __name__ == "__main__": - asyncio.run(main()) - -``` - -#### Presets - the simplest way to get started -These are purpose-built, optimized configurations, ready for use without further modification: - -`fast` - low latency, fast responses - -`adaptive` - general conversation - -`smart_turn` - complex conversation - -`external` - user handles end of turn - -`scribe` - 
note-taking - -`captions` - live captioning - -To view all available presets: -```python -presets = VoiceAgentConfigPreset.list_presets() -``` - -### 4. Custom configurations - -For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays: - - -Specify configurations in a `VoiceAgentConfig` object: - - {pythonVoiceCustomConfig} - - - -Use presets as a starting point and customise with overlays: - - {pythonVoiceConfigOverlays} - - - - -Note: If no configuration or preset is provided, the client will default to the `external` preset. - - - - -## FAQ -### Support - -
-Where can I provide feedback or get help? - -You can submit feedback, bug reports, or feature requests through the Speechmatics [GitHub discussions](https://github.com/orgs/speechmatics/discussions). -
- -## Next steps - -- For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on GitHub. -- For working examples, integrations and templates, check out the [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy). -- Share and discuss your project with [our team](https://support.speechmatics.com) or join our [developer community on Reddit](https://www.reddit.com/r/Speechmatics) to connect with other builders in voice AI. +Use the Voice SDK to handle turn detection, group transcripts into clean segments, and apply diarization for LLM workflows. +See [Voice SDK](/voice-agents/voice-sdk) for getting started, presets, and configuration. diff --git a/docs/voice-agents/sidebar.ts b/docs/voice-agents/sidebar.ts index f14bba42..a622bfff 100644 --- a/docs/voice-agents/sidebar.ts +++ b/docs/voice-agents/sidebar.ts @@ -1,5 +1,3 @@ -import voiceAgentsFlowSidebar from "./flow/sidebar"; - export default { type: "category", label: "Voice agents", @@ -13,9 +11,8 @@ export default { }, { type: "doc", - id: "voice-agents/features", - label: "Features", + id: "voice-agents/voice-sdk", + label: "Voice SDK", }, - voiceAgentsFlowSidebar, ], } as const; \ No newline at end of file diff --git a/docs/voice-agents/features.mdx b/docs/voice-agents/voice-sdk.mdx similarity index 52% rename from docs/voice-agents/features.mdx rename to docs/voice-agents/voice-sdk.mdx index 8232cc91..bcef2065 100644 --- a/docs/voice-agents/features.mdx +++ b/docs/voice-agents/voice-sdk.mdx @@ -1,9 +1,163 @@ --- -description: Learn about configuration parameters for the Voice SDK +description: Learn how to use the Voice SDK. 
--- import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -# Features +import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" +import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw" +import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw" + + + +# Voice SDK + +The Voice SDK is a Python library that provides additional features optimized for conversational AI, built on top of our Realtime API. + +We use it to build our integrations, and it is also available for you to use. + +- **Intelligent segmentation**: groups words into meaningful speech segments per speaker. +- **Turn detection**: automatically detects when speakers finish talking. +- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. +- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. +- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. + +### Voice SDK vs Realtime SDK + +Use the Voice SDK when: + +- Building conversational AI or voice agents +- You need automatic turn detection +- You want speaker-focused transcription +- You need ready-to-use presets for common scenarios + +Use the Realtime SDK when: + +- You need the raw stream of word-by-word transcription data +- Building custom segmentation logic +- You want fine-grained control over every event +- Processing audio files or custom workflows + +## Getting started + +### 1. Create an API key + +[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. +Store your key securely as a managed secret. + +### 2. Install dependencies + +```bash +# Standard installation +pip install speechmatics-voice + +# With SMART_TURN (ML-based turn detection) +pip install speechmatics-voice[smart] +``` + +### 3. 
Quickstart + +Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID: + +```python +import asyncio +import os +from speechmatics.rt import Microphone +from speechmatics.voice import VoiceAgentClient, AgentServerMessageType + +async def main(): + """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" + + # Audio configuration + SAMPLE_RATE = 16000 # Hz + CHUNK_SIZE = 160 # Samples per read + PRESET = "scribe" # Configuration preset + + # Create client with preset + client = VoiceAgentClient( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + preset=PRESET + ) + + # Print finalised segments of speech with speaker ID + @client.on(AgentServerMessageType.ADD_SEGMENT) + def on_segment(message): + for segment in message["segments"]: + speaker = segment["speaker_id"] + text = segment["text"] + print(f"{speaker}: {text}") + + # Setup microphone + mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) + if not mic.start(): + print("Error: Microphone not available") + return + + # Connect to the Voice Agent + await client.connect() + + # Stream microphone audio (interruptable using keyboard) + try: + while True: + audio_chunk = await mic.read(CHUNK_SIZE) + if not audio_chunk: + break # Microphone stopped producing data + await client.send_audio(audio_chunk) + except KeyboardInterrupt: + pass + finally: + await client.disconnect() + +if __name__ == "__main__": + asyncio.run(main()) + +``` + +#### Presets - the simplest way to get started + +These are purpose-built, optimized configurations, ready for use without further modification: + +`fast` - low latency, fast responses + +`adaptive` - general conversation + +`smart_turn` - complex conversation + +`external` - user handles end of turn + +`scribe` - note-taking + +`captions` - live captioning + +To view all available presets: + +```python +presets = VoiceAgentConfigPreset.list_presets() +``` + +### 4. 
Custom configurations + +For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays: + + + +Specify configurations in a `VoiceAgentConfig` object: + + {pythonVoiceCustomConfig} + + + +Use presets as a starting point and customise with overlays: + + {pythonVoiceConfigOverlays} + + + + +Note: If no configuration or preset is provided, the client will default to the `external` preset. + +## Configuration ### Basic parameters `language` (str, default: "en") @@ -45,7 +199,8 @@ Silence duration in seconds to trigger turn end. Maximum delay before forcing turn end. `max_delay` (float, default: 0.7) -Maximum transcription delay for word emission. +Maximum transcription delay for word emission. +Defaults to 0.7 seconds, but when using turn detection we recommend 1.0s for better accuracy. Turn detection will ensure finalisation latency is not affected. ### Speaker configuration `speaker_sensitivity` (float, default: 0.5) diff --git a/sidebars.ts b/sidebars.ts index 61f6511b..2f7522b6 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -10,8 +10,8 @@ export default { docs: [ gettingStartedSidebar, speechToTextSidebar, - voiceAgentsSidebar, textToSpeechSidebar, + voiceAgentsSidebar, integrationsAndSDKSidebar, deploymentsSidebar, { From 905a543b9bea544fe6ff4f0314819c1227e51bfe Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Mon, 23 Feb 2026 22:39:09 +0000 Subject: [PATCH 2/6] Voice agents: refactor and expand Voice SDK docs --- docs/voice-agents/assets/additional-vocab.py | 12 + docs/voice-agents/assets/advanced-config.py | 22 + docs/voice-agents/assets/basic-config.py | 36 ++ docs/voice-agents/assets/custom-config.py | 1 + .../voice-agents/assets/event-subscription.py | 9 + docs/voice-agents/assets/known-speakers.py | 9 + docs/voice-agents/assets/quickstart.py | 50 ++ docs/voice-agents/assets/smart-turn.py | 15 + .../assets/speaker-focus-handler.py | 7 + docs/voice-agents/assets/speaker-focus.py | 27 + 
docs/voice-agents/voice-sdk.mdx | 496 ++++++++++-------- 11 files changed, 475 insertions(+), 209 deletions(-) create mode 100644 docs/voice-agents/assets/additional-vocab.py create mode 100644 docs/voice-agents/assets/advanced-config.py create mode 100644 docs/voice-agents/assets/basic-config.py create mode 100644 docs/voice-agents/assets/event-subscription.py create mode 100644 docs/voice-agents/assets/known-speakers.py create mode 100644 docs/voice-agents/assets/quickstart.py create mode 100644 docs/voice-agents/assets/smart-turn.py create mode 100644 docs/voice-agents/assets/speaker-focus-handler.py create mode 100644 docs/voice-agents/assets/speaker-focus.py diff --git a/docs/voice-agents/assets/additional-vocab.py b/docs/voice-agents/assets/additional-vocab.py new file mode 100644 index 00000000..fa221e15 --- /dev/null +++ b/docs/voice-agents/assets/additional-vocab.py @@ -0,0 +1,12 @@ +from speechmatics.voice import AdditionalVocabEntry, VoiceAgentConfig + +config = VoiceAgentConfig( + language="en", + additional_vocab=[ + AdditionalVocabEntry( + content="Speechmatics", + sounds_like=["speech matters", "speech matics"] + ), + AdditionalVocabEntry(content="API"), + ] +) diff --git a/docs/voice-agents/assets/advanced-config.py b/docs/voice-agents/assets/advanced-config.py new file mode 100644 index 00000000..64ee1e46 --- /dev/null +++ b/docs/voice-agents/assets/advanced-config.py @@ -0,0 +1,22 @@ +from speechmatics.voice import ( + EndOfUtteranceMode, + SpeakerFocusConfig, + SpeakerFocusMode, + SpeakerIdentifier, + VoiceAgentConfig, + VoiceAgentConfigPreset, +) + +overrides = VoiceAgentConfig( + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, + enable_diarization=True, + speaker_config=SpeakerFocusConfig( + focus_speakers=["S1"], + focus_mode=SpeakerFocusMode.RETAIN, + ), + known_speakers=[ + SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]), + ], +) + +config = VoiceAgentConfigPreset.ADAPTIVE(overrides) diff --git 
a/docs/voice-agents/assets/basic-config.py b/docs/voice-agents/assets/basic-config.py new file mode 100644 index 00000000..b6b5cfec --- /dev/null +++ b/docs/voice-agents/assets/basic-config.py @@ -0,0 +1,36 @@ +from speechmatics.voice import ( + AdditionalVocabEntry, + AudioEncoding, + OperatingPoint, + VoiceAgentConfig, + VoiceAgentConfigPreset, +) + +overrides = VoiceAgentConfig( + # Language and locale + language="en", # e.g. "en", "es", "fr" + output_locale=None, # e.g. "en-GB", "en-US" + + # Model selection + operating_point=OperatingPoint.ENHANCED, # STANDARD or ENHANCED + domain=None, # e.g. "finance", "medical" + + # Vocabulary + additional_vocab=[ + AdditionalVocabEntry( + content="Speechmatics", + sounds_like=["speech matters", "speech matics"], + ), + AdditionalVocabEntry(content="API"), + ], + punctuation_overrides=None, + + # Audio + sample_rate=16000, + audio_encoding=AudioEncoding.PCM_S16LE, + + # Diarization + enable_diarization=True, +) + +config = VoiceAgentConfigPreset.ADAPTIVE(overrides) diff --git a/docs/voice-agents/assets/custom-config.py b/docs/voice-agents/assets/custom-config.py index b55e8854..be798933 100644 --- a/docs/voice-agents/assets/custom-config.py +++ b/docs/voice-agents/assets/custom-config.py @@ -1,3 +1,4 @@ +import os from speechmatics.voice import VoiceAgentClient, VoiceAgentConfig, EndOfUtteranceMode config = VoiceAgentConfig( diff --git a/docs/voice-agents/assets/event-subscription.py b/docs/voice-agents/assets/event-subscription.py new file mode 100644 index 00000000..6f5ad677 --- /dev/null +++ b/docs/voice-agents/assets/event-subscription.py @@ -0,0 +1,9 @@ +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_final_segment(message): + for segment in message["segments"]: + print(f"[FINAL] {segment['speaker_id']}: {segment['text']}") + +@client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT) +def on_partial_segment(message): + for segment in message["segments"]: + print(f"[PARTIAL] {segment['speaker_id']}: 
{segment['text']}") diff --git a/docs/voice-agents/assets/known-speakers.py b/docs/voice-agents/assets/known-speakers.py new file mode 100644 index 00000000..1a03eff3 --- /dev/null +++ b/docs/voice-agents/assets/known-speakers.py @@ -0,0 +1,9 @@ +from speechmatics.voice import SpeakerIdentifier, VoiceAgentConfig + +config = VoiceAgentConfig( + enable_diarization=True, + known_speakers=[ + SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]), + SpeakerIdentifier(label="Bob", speaker_identifiers=["YY...YY"]) + ] +) diff --git a/docs/voice-agents/assets/quickstart.py b/docs/voice-agents/assets/quickstart.py new file mode 100644 index 00000000..00e9bc91 --- /dev/null +++ b/docs/voice-agents/assets/quickstart.py @@ -0,0 +1,50 @@ +import asyncio +import os +from speechmatics.rt import Microphone +from speechmatics.voice import VoiceAgentClient, AgentServerMessageType + +async def main(): + """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" + + # Audio configuration + SAMPLE_RATE = 16000 # Hz + CHUNK_SIZE = 160 # Samples per read + PRESET = "scribe" # Configuration preset + + # Create client with preset + client = VoiceAgentClient( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + preset=PRESET + ) + + # Print finalised segments of speech with speaker ID + @client.on(AgentServerMessageType.ADD_SEGMENT) + def on_segment(message): + for segment in message["segments"]: + speaker = segment["speaker_id"] + text = segment["text"] + print(f"{speaker}: {text}") + + # Setup microphone + mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) + if not mic.start(): + print("Error: Microphone not available") + return + + # Connect to the Voice Agent + await client.connect() + + # Stream microphone audio (interruptable using keyboard) + try: + while True: + audio_chunk = await mic.read(CHUNK_SIZE) + if not audio_chunk: + break # Microphone stopped producing data + await client.send_audio(audio_chunk) + except KeyboardInterrupt: + pass + finally: + await 
client.disconnect() + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/voice-agents/assets/smart-turn.py b/docs/voice-agents/assets/smart-turn.py new file mode 100644 index 00000000..d887257f --- /dev/null +++ b/docs/voice-agents/assets/smart-turn.py @@ -0,0 +1,15 @@ +from speechmatics.voice import ( + EndOfUtteranceMode, + SmartTurnConfig, + VoiceAgentConfig, + VoiceAgentConfigPreset, +) + +# ADAPTIVE mode + ML-enhanced turn detection +config = VoiceAgentConfig( + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, + smart_turn_config=SmartTurnConfig(enabled=True), +) + +# Or use the SMART_TURN preset which bundles this configuration +config = VoiceAgentConfigPreset.SMART_TURN() diff --git a/docs/voice-agents/assets/speaker-focus-handler.py b/docs/voice-agents/assets/speaker-focus-handler.py new file mode 100644 index 00000000..a0375fc9 --- /dev/null +++ b/docs/voice-agents/assets/speaker-focus-handler.py @@ -0,0 +1,7 @@ +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + for segment in message["segments"]: + if segment["is_active"]: + process_focused_speaker(segment["text"]) + else: + process_passive_speaker(segment["speaker_id"], segment["text"]) diff --git a/docs/voice-agents/assets/speaker-focus.py b/docs/voice-agents/assets/speaker-focus.py new file mode 100644 index 00000000..1c2dcc79 --- /dev/null +++ b/docs/voice-agents/assets/speaker-focus.py @@ -0,0 +1,27 @@ +from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode, VoiceAgentConfig + +# Focus on specific speakers, keep others as passive +config = VoiceAgentConfig( + enable_diarization=True, + speaker_config=SpeakerFocusConfig( + focus_speakers=["S1", "S2"], + focus_mode=SpeakerFocusMode.RETAIN + ) +) + +# Focus on specific speakers, exclude everyone else +config = VoiceAgentConfig( + enable_diarization=True, + speaker_config=SpeakerFocusConfig( + focus_speakers=["S1", "S2"], + focus_mode=SpeakerFocusMode.IGNORE + ) +) + +# Blacklist specific speakers 
(exclude them from all processing) +config = VoiceAgentConfig( + enable_diarization=True, + speaker_config=SpeakerFocusConfig( + ignore_speakers=["S3"], + ) +) diff --git a/docs/voice-agents/voice-sdk.mdx b/docs/voice-agents/voice-sdk.mdx index bcef2065..f17db73b 100644 --- a/docs/voice-agents/voice-sdk.mdx +++ b/docs/voice-agents/voice-sdk.mdx @@ -1,6 +1,7 @@ --- description: Learn how to use the Voice SDK. --- + import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; @@ -8,11 +9,20 @@ import TabItem from '@theme/TabItem'; import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw" import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw" - - +import pythonQuickstart from "./assets/quickstart.py?raw" +import pythonEventSubscription from "./assets/event-subscription.py?raw" +import pythonAdditionalVocab from "./assets/additional-vocab.py?raw" +import pythonBasicConfig from "./assets/basic-config.py?raw" +import pythonSmartTurn from "./assets/smart-turn.py?raw" +import pythonSpeakerFocus from "./assets/speaker-focus.py?raw" +import pythonSpeakerFocusHandler from "./assets/speaker-focus-handler.py?raw" +import pythonKnownSpeakers from "./assets/known-speakers.py?raw" +import pythonAdvancedConfig from "./assets/advanced-config.py?raw" # Voice SDK +## Overview + The Voice SDK is a Python library that provides additional features optimized for conversational AI, built on top of our Realtime API. We use it to build our integrations, and it is also available for you to use. @@ -23,30 +33,53 @@ We use it to build our integrations, and it is also available for you to use. - **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. - **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. 
-### Voice SDK vs Realtime SDK +### Segmentation + +Segmentation groups words into readable chunks of text. +In practice, this means you can work with finalized segments rather than stitching together word-by-word updates. + +### Turn detection and finalization + +Turn detection determines when a speaker has finished a turn. +When a turn is detected, speech is finalized into segments that you can use in your application. + +Turn detection (and subsequent finalization) is important for speed: the sooner a turn is finalized, the sooner you can send a final transcript to an LLM. -Use the Voice SDK when: +We take the complexity out of this through presets. +If you prefer manual control, use the `external` preset and call `client.finalize()` to end a turn. +This sends a signal to the Speechmatics servers to finalize the current speech immediately. -- Building conversational AI or voice agents -- You need automatic turn detection -- You want speaker-focused transcription -- You need ready-to-use presets for common scenarios +### Diarization and speaker management -Use the Realtime SDK when: +When diarization is enabled, the Voice SDK assigns speaker IDs (for example `S1`, `S2`) and produces segments per speaker. -- You need the raw stream of word-by-word transcription data -- Building custom segmentation logic -- You want fine-grained control over every event -- Processing audio files or custom workflows +You can also: -## Getting started +- Focus on specific speakers +- Ignore specific speakers +- Provide known speakers for speaker identification -### 1. 
Create an API key +### Voice SDK vs Realtime SDK + +- Use the Voice SDK when: + - Building conversational AI or voice agents + - You need automatic turn detection + - You want speaker-focused transcription + - You need ready-to-use presets for common scenarios +- Use the Realtime SDK when: + - You need the raw stream of word-by-word transcription data + - Building custom segmentation logic + - You want fine-grained control over every event + - Processing audio files or custom workflows + +## Get started + +### Create an API key [Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. Store your key securely as a managed secret. -### 2. Install dependencies +### Install ```bash # Standard installation @@ -56,79 +89,107 @@ pip install speechmatics-voice pip install speechmatics-voice[smart] ``` -### 3. Quickstart +### Quickstart Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID: -```python -import asyncio -import os -from speechmatics.rt import Microphone -from speechmatics.voice import VoiceAgentClient, AgentServerMessageType - -async def main(): - """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" - - # Audio configuration - SAMPLE_RATE = 16000 # Hz - CHUNK_SIZE = 160 # Samples per read - PRESET = "scribe" # Configuration preset - - # Create client with preset - client = VoiceAgentClient( - api_key=os.getenv("SPEECHMATICS_API_KEY"), - preset=PRESET - ) - - # Print finalised segments of speech with speaker ID - @client.on(AgentServerMessageType.ADD_SEGMENT) - def on_segment(message): - for segment in message["segments"]: - speaker = segment["speaker_id"] - text = segment["text"] - print(f"{speaker}: {text}") - - # Setup microphone - mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) - if not mic.start(): - print("Error: Microphone not available") - return - - # Connect to the Voice Agent - await client.connect() - - 
# Stream microphone audio (interruptable using keyboard) - try: - while True: - audio_chunk = await mic.read(CHUNK_SIZE) - if not audio_chunk: - break # Microphone stopped producing data - await client.send_audio(audio_chunk) - except KeyboardInterrupt: - pass - finally: - await client.disconnect() - -if __name__ == "__main__": - asyncio.run(main()) + + {pythonQuickstart} + + +Note: `Microphone` is imported from the Realtime SDK (`speechmatics.rt`). Install with `pip install speechmatics`. + +## Events and segments +The Voice SDK emits events as transcription progresses. The two main segment events are: + +- `ADD_PARTIAL_SEGMENT` - Interim results that stream in real-time as speech is recognized +- `ADD_SEGMENT` - Final results emitted when a turn ends + +### How segments work + +As someone speaks, you receive `ADD_PARTIAL_SEGMENT` events with the current transcription. These update continuously—each new partial replaces the previous one. + +When a turn is detected (or you call `client.finalize()`), the SDK emits an `ADD_SEGMENT` event with the finalized transcript. This is the stable result you should use for downstream processing like sending to an LLM. + +``` +Speaking: "Hello, how are you?" + +Timeline: + ADD_PARTIAL_SEGMENT: "Hello" + ADD_PARTIAL_SEGMENT: "Hello, how" + ADD_PARTIAL_SEGMENT: "Hello, how are" + ADD_PARTIAL_SEGMENT: "Hello, how are you" + (turn detected or finalize() called) + ADD_SEGMENT: "Hello, how are you?" 
← Use this +``` + +### Segment payload + +Example `ADD_SEGMENT` payload: + +```json +{ + "message": "AddSegment", + "segments": [ + { + "speaker_id": "S1", + "is_active": true, + "timestamp": "2025-11-11T23:18:37.189+00:00", + "language": "en", + "text": "Welcome to Speechmatics.", + "metadata": { + "start_time": 1.28, + "end_time": 8.04 + } + } + ], + "metadata": { + "start_time": 1.28, + "end_time": 8.04, + "processing_time": 0.187 + } +} ``` -#### Presets - the simplest way to get started +**Field explanations:** +- `speaker_id`: Speaker label (e.g., `S1`, `S2`, or a custom label if using known speakers) +- `is_active`: Whether this speaker is in your focus list (see Speaker focus) +- `timestamp`: Absolute wall-clock time (ISO 8601 format) +- `start_time` / `end_time`: Time in seconds relative to the start of the session +- `processing_time`: Transcription latency in seconds + +### Subscribing to events + + + {pythonEventSubscription} + + +### When finals are emitted + +Final segments (`ADD_SEGMENT`) are emitted when: +1. **Turn detection** triggers automatically (based on your preset/config) +2. **You call `client.finalize()`** manually (when using the `external` preset) + +See Turn detection for more on automatic finalization. 
+ +## Presets These are purpose-built, optimized configurations, ready for use without further modification: -`fast` - low latency, fast responses +`FAST` - low latency, fast responses + +`FIXED` - general conversation with fixed timing -`adaptive` - general conversation +`ADAPTIVE` - general conversation with adaptive timing -`smart_turn` - complex conversation +`SMART_TURN` - complex conversation with ML-enhanced turn detection -`external` - user handles end of turn +`EXTERNAL` - user handles end of turn -`scribe` - note-taking +`SCRIBE` - note-taking -`captions` - live captioning +`CAPTIONS` - live captioning To view all available presets: @@ -136,7 +197,10 @@ To view all available presets: presets = VoiceAgentConfigPreset.list_presets() ``` -### 4. Custom configurations +Presets include defaults for all settings (language defaults to English). +To change the language (or any other preset setting), use a custom configuration or use a preset as a starting point and customize with overlays. + +### Custom configuration For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays: @@ -157,52 +221,141 @@ Use presets as a starting point and customise with overlays: Note: If no configuration or preset is provided, the client will default to the `external` preset. -## Configuration +## Basic configuration + +### Language and locale -### Basic parameters `language` (str, default: "en") Language code for transcription (e.g., "en", "es", "fr"). See [supported languages](/speech-to-text/languages). +`output_locale` (str, default: None) +Output locale for formatting (e.g., "en-GB", "en-US"). +See [supported languages and locales](/speech-to-text/languages). + +### Model selection + `operating_point` (OperatingPoint, default: ENHANCED) -Balance accuracy vs latency. +Select an accuracy level. Options: STANDARD or ENHANCED. `domain` (str, default: None) Domain-specific model (e.g., "finance", "medical"). 
See [supported languages and domains](/speech-to-text/languages). -`output_locale` (str, default: None) -Output locale for formatting (e.g., "en-GB", "en-US"). -See [supported languages and locales](/speech-to-text/languages). +### Vocabulary + +`additional_vocab` (list[AdditionalVocabEntry], default: []) + +Custom vocabulary for domain-specific terms. + + + {pythonAdditionalVocab} + + +`punctuation_overrides` (dict, default: None) +Custom punctuation rules. Keys are punctuation marks, values are replacement strings. + +### Audio + +`sample_rate` (int, default: 16000) +Audio sample rate in Hz. + +`audio_encoding` (AudioEncoding, default: PCM_S16LE) +Audio encoding format. + +### Latency and quality + +`max_delay` (float, default: 1.0) +Maximum transcription delay in seconds for word emission. +Turn detection ensures finalisation latency is not affected. + +### Basic diarization `enable_diarization` (bool, default: False) Enable speaker diarization to identify and label different speakers. +When enabled, segments include a `speaker_id` field (for example `S1`, `S2`). + +### Basic configuration example + + + {pythonBasicConfig} + + +## Advanced configuration ### Turn detection -`end_of_utterance_mode` (EndOfUtteranceMode, default: FIXED) -Controls how turn endings are detected: - -- `FIXED`: Uses fixed silence threshold. -Fast but may split slow speech. -- `ADAPTIVE`: Adjusts delay based on speech rate, pauses, and disfluencies. -Best for natural conversation. -- `SMART_TURN`: Uses ML model to detect acoustic turn-taking cues. -Requires [smart] extras. -- `EXTERNAL`: Manual control via client.finalize(). -For custom turn logic. - -`end_of_utterance_silence_trigger` (float, default: 0.2) + +Presets configure turn detection under the hood. +When a turn is detected (or you call `client.finalize()` using the `external` preset), we send a signal to our servers so you can get the final transcript back as quickly as possible. 
+ +This works in multi-speaker scenarios, including when diarization is enabled. + +`end_of_utterance_mode` (EndOfUtteranceMode, default: FIXED) +Controls the base strategy for detecting turn endings: + +- `FIXED`: Uses a fixed silence threshold. Fast but may split slow speech. +- `ADAPTIVE`: Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation. +- `EXTERNAL`: Manual control via `client.finalize()`. For custom turn logic. + +`end_of_utterance_silence_trigger` (float, default: 0.5) Silence duration in seconds to trigger turn end. -`end_of_utterance_max_delay` (float, default: 10.0) +`end_of_utterance_max_delay` (float, default: 10.0) Maximum delay before forcing turn end. -`max_delay` (float, default: 0.7) -Maximum transcription delay for word emission. -Defaults to 0.7 seconds, but when using turn detection we recommend 1.0s for better accuracy. Turn detection will ensure finalisation latency is not affected. +#### Voice activity detection + +`vad_config` (VoiceActivityConfig, default: None) +Configure voice activity detection: + +- `enabled` (bool, default: False) - Enable VAD. +- `silence_duration` (float, default: 0.18) - Seconds of silence before considering speech ended. +- `threshold` (float, default: 0.35) - Sensitivity threshold for detecting speech. + +#### Smart turn (ML-enhanced detection) + +`smart_turn_config` (SmartTurnConfig, default: None) +Enables an ML model that detects acoustic turn-taking cues (intonation, rhythm patterns) on top of the base mode. + +Smart turn can be combined with `FIXED` or `ADAPTIVE` modes, but **not** with `EXTERNAL` mode. + + + {pythonSmartTurn} + + +Requires the `[smart]` extras: `pip install speechmatics-voice[smart]` + +### Segment output options + +`include_partials` (bool, default: True) +Emit partial segments via `ADD_PARTIAL_SEGMENT`. +Set to `False` for final-only output. + +`include_results` (bool, default: False) +Include word-level timing data in segments. 
+ +`transcription_update_preset` (TranscriptionUpdatePreset, default: COMPLETE) +Controls when partial segment updates are emitted. +Options: `COMPLETE`, `COMPLETE_PLUS_TIMING`, `WORDS`, `WORDS_PLUS_TIMING`, `TIMING`. + +### Segment generation options + +`speech_segment_config` (SpeechSegmentConfig, default: SpeechSegmentConfig()) +Fine-tune segment generation and post-processing: + +- `add_trailing_eos` (bool, default: False) - Append end-of-sentence markers to segments that are missing them. +- `emit_sentences` (bool, default: True) - Emit a finalized segment as soon as a sentence ends. If a speaker continues during a turn, multiple segments may be emitted. +- `pause_mark` (Optional[str], default: None) - Insert a custom string when pauses are detected (e.g., `"..."` produces `"Hello ... how are you?"`). + +### Advanced diarization + +#### Sensitivity and speaker limits + +`enable_diarization` (bool, default: False) +Enable speaker diarization to identify and label different speakers. +You must set this to `True` to use any of the diarization options below. -### Speaker configuration `speaker_sensitivity` (float, default: 0.5) Diarization sensitivity between 0.0 and 1.0. Higher values detect more speakers. @@ -210,141 +363,66 @@ Higher values detect more speakers. `max_speakers` (int, default: None) Limit maximum number of speakers to detect. +#### Speaker grouping + `prefer_current_speaker` (bool, default: False) Give extra weight to current speaker for word grouping. -`speaker_config` (SpeakerFocusConfig, default: SpeakerFocusConfig()) +#### Speaker focus + +`speaker_config` (SpeakerFocusConfig, default: SpeakerFocusConfig()) Configure speaker focus/ignore rules. 
- -{ -`from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode - -# Focus only on specific speakers -config = VoiceAgentConfig( - enable_diarization=True, - speaker_config=SpeakerFocusConfig( - focus_speakers=["S1", "S2"], - focus_mode=SpeakerFocusMode.RETAIN - ) -) - -# Ignore specific speakers -config = VoiceAgentConfig( - enable_diarization=True, - speaker_config=SpeakerFocusConfig( - ignore_speakers=["S3"], - focus_mode=SpeakerFocusMode.IGNORE - ) -)` -} - -`known_speakers` (list[SpeakerIdentifier], default: []) -Pre-enrolled speaker identifiers for speaker identification. +When diarization is enabled, you can control which speakers appear in your output and how they are treated. - -{ -`from speechmatics.voice import SpeakerIdentifier - -config = VoiceAgentConfig( - enable_diarization=True, - known_speakers=[ - SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]), - SpeakerIdentifier(label="Bob", speaker_identifiers=["YY...YY"]) - ] -)` -} - -### Language and vocabulary -`additional_vocab` (list[AdditionalVocabEntry], default: []) +When no `focus_speakers` are configured, all detected speakers are treated as active (`is_active: true`). -Custom vocabulary for domain-specific terms. +**Active speakers** are speakers in your `focus_speakers` list. +Their segments have `is_active: true`. - -{ -`from speechmatics.voice import AdditionalVocabEntry - -config = VoiceAgentConfig( - language="en", - additional_vocab=[ - AdditionalVocabEntry( - content="Speechmatics", - sounds_like=["speech matters", "speech matics"] - ), - AdditionalVocabEntry(content="API"), - ] -)` -} - -`punctuation_overrides` (dict, default: None) -Custom punctuation rules. +**Passive speakers** are speakers not in `focus_speakers` but still included in output when using `SpeakerFocusMode.RETAIN`. +Their segments have `is_active: false`. -### Audio parameters -`sample_rate` (int, default: 16000) -Audio sample rate in Hz. 
+**Ignored speakers** are completely excluded from output. +Their speech does not appear in segments and does not trigger turn detection. -`audio_encoding` (AudioEncoding, default: PCM_S16LE) -Audio encoding format. +**SpeakerFocusMode options:** -### Advanced parameters -`transcription_update_preset` (TranscriptionUpdatePreset, default: COMPLETE) -Controls when to emit updates: COMPLETE, COMPLETE_PLUS_TIMING, WORDS, WORDS_PLUS_TIMING, or TIMING. +- `RETAIN`: Non-focused speakers are kept in output as passive speakers (`is_active: false`). Use this when you want to prioritize certain speakers but still see what others say. +- `IGNORE`: Non-focused speakers are excluded entirely from output. Use this when you only care about specific speakers and want to filter out everyone else. -`speech_segment_config` (SpeechSegmentConfig, default: SpeechSegmentConfig()) -Fine-tune segment generation and post-processing. + + {pythonSpeakerFocus} + -`smart_turn_config` (SmartTurnConfig, default: None) -Configure SMART_TURN behavior (buffer length, threshold). +In your event handler, you can use `is_active` to decide how to route segments: -`include_results` (bool, default: False) -Include word-level timing data in segments. + + {pythonSpeakerFocusHandler} + -`include_partials` (bool, default: True) -Emit partial segments. Set to False for final-only output. +#### Known speakers (speaker identification) -### Configuration with overlays. -Use presets as a starting point and customize with overlays: +`known_speakers` (list[SpeakerIdentifier], default: []) +Pre-enrolled speaker identifiers for speaker identification. 
-{ -`from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig - -# Use preset with custom overrides -config = VoiceAgentConfigPreset.SCRIBE( - VoiceAgentConfig( - language="es", - max_delay=0.8 - ) -)` -} + {pythonKnownSpeakers} -### Available presets +### Advanced configuration example + -{ -`presets = VoiceAgentConfigPreset.list_presets() -# Output: ['low_latency', 'conversation_adaptive', 'conversation_smart_turn', 'scribe', 'captions']` -} + {pythonAdvancedConfig} -### Configuration serialization +## Import and export configurations Export and import configurations as JSON: -{ -`from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig - -# Export preset to JSON -config_json = VoiceAgentConfigPreset.SCRIBE().to_json() - -# Load from JSON -config = VoiceAgentConfig.from_json(config_json) - -# Or create from JSON string -config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}')` -} + {pythonVoiceConfigSerialization} +## More information -For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on github. 
-` +- Voice SDK on GitHub: https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice From bb79643946aa66d4d93ae3706e1bea67acf69bf5 Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Fri, 27 Feb 2026 09:59:53 +0000 Subject: [PATCH 3/6] Update docs/voice-agents/voice-sdk.mdx american english typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/voice-agents/voice-sdk.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/voice-agents/voice-sdk.mdx b/docs/voice-agents/voice-sdk.mdx index f17db73b..8c2901c5 100644 --- a/docs/voice-agents/voice-sdk.mdx +++ b/docs/voice-agents/voice-sdk.mdx @@ -91,7 +91,7 @@ pip install speechmatics-voice[smart] ### Quickstart -Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID: +Here's how to stream microphone audio to the Voice Agent and transcribe finalized segments of speech, with speaker ID: {pythonQuickstart} From 1483f42e6f23e49eed34b2e5bfca6ab92319d1a7 Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Fri, 27 Feb 2026 10:00:42 +0000 Subject: [PATCH 4/6] Update docs/voice-agents/assets/quickstart.py typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/voice-agents/assets/quickstart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/voice-agents/assets/quickstart.py b/docs/voice-agents/assets/quickstart.py index 00e9bc91..5b8aabbc 100644 --- a/docs/voice-agents/assets/quickstart.py +++ b/docs/voice-agents/assets/quickstart.py @@ -34,7 +34,7 @@ def on_segment(message): # Connect to the Voice Agent await client.connect() - # Stream microphone audio (interruptable using keyboard) + # Stream microphone audio (interruptible using keyboard) try: while True: audio_chunk = await mic.read(CHUNK_SIZE) From 05057e46395a822dc46c68d923413e2ecbd105df Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Fri, 27 Feb 2026 10:13:05 +0000 Subject: [PATCH 5/6] 
Resolved errors and enhanced some sections for clarity --- docs/voice-agents/overview.mdx | 12 ++++++------ docs/voice-agents/voice-sdk.mdx | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index 80044889..b78d9042 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -6,24 +6,24 @@ import { Grid } from "@radix-ui/themes"; # Voice agents overview -Our integration partners can be the quickest way to get a production voice agent up and running. +There are two ways to build voice agents using Speechmatics: -If you’re building it yourself, you can also use our Voice SDK. Integrations are built on top of the Voice SDK, which provides features optimized for conversational AI. +- Integration partners (LiveKit, Pipecat and VAPI): the fastest path to a production voice agent. +- Voice SDK: direct access for custom pipelines or working outside of supported integration platforms. -If you’re building an integration and want to work with us, contact support. ## Features Speechmatics provides building blocks you can use through integrations and the Voice SDK. -It includes: +Key features include: - **Turn detection**: detect when a speaker has finished talking. - **Intelligent segmentation**: group partial transcripts into clean, speaker-attributed segments. - **Diarization**: identify and label different speakers. - **Speaker focus**: focus on or ignore specific speakers in multi-speaker scenarios. - **Preset configurations**: start quickly with ready-to-use settings. -- **Structured events**: work with clean segments instead of raw word-level events. +- **Intelligent Segmentation**: work with clean, single speaker, segments instead of raw word-level events. 
## Integrations @@ -54,4 +54,4 @@ Use an integration to handle audio transport and wiring, so you can focus on you Use the Voice SDK to handle turn detection, group transcripts into clean segments, and apply diarization for LLM workflows. -See [Voice SDK](/voice-agents/voice-sdk) for getting started, presets, and configuration. +See [Voice SDK](/voice-agents/voice-sdk) for information on getting started, presets, and configuration. diff --git a/docs/voice-agents/voice-sdk.mdx b/docs/voice-agents/voice-sdk.mdx index 8c2901c5..8bad8416 100644 --- a/docs/voice-agents/voice-sdk.mdx +++ b/docs/voice-agents/voice-sdk.mdx @@ -31,11 +31,11 @@ We use it to build our integrations, and it is also available for you to use. - **Turn detection**: automatically detects when speakers finish talking. - **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. - **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. -- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. +- **Intelligent segmentation**: delivers clean, per-speaker, structured segments instead of raw word-level events. ### Segmentation -Segmentation groups words into readable chunks of text. +Segmentation groups words into readable, per-speaker chunks of text. In practice, this means you can work with finalized segments rather than stitching together word-by-word updates. ### Turn detection and finalization @@ -97,7 +97,7 @@ Here's how to stream microphone audio to the Voice Agent and transcribe finalize {pythonQuickstart} -Note: `Microphone` is imported from the Realtime SDK (`speechmatics.rt`). Install with `pip install speechmatics`. +Note: `Microphone` is imported from the Realtime SDK (`speechmatics.rt`). Install with `pip install speechmatics-rt`.
## Events and segments @@ -179,7 +179,7 @@ These are purpose-built, optimized configurations, ready for use without further `FAST` - low latency, fast responses -`FIXED` - general conversation with fixed timing +`FIXED` - general conversation with fixed, silence-based turn detection `ADAPTIVE` - general conversation with adaptive timing From 5045ec97c47c9f75b203fd2d6f35b3805a059026 Mon Sep 17 00:00:00 2001 From: Archie McMullan Date: Fri, 27 Feb 2026 10:20:53 +0000 Subject: [PATCH 6/6] Fix duplicated 'intelligent segmentation' --- docs/voice-agents/overview.mdx | 1 - docs/voice-agents/voice-sdk.mdx | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index b78d9042..788d7600 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -23,7 +23,6 @@ Key features include: - **Diarization**: identify and label different speakers. - **Speaker focus**: focus on or ignore specific speakers in multi-speaker scenarios. - **Preset configurations**: start quickly with ready-to-use settings. -- **Intelligent Segmentation**: work with clean, single speaker, segments instead of raw word-level events. ## Integrations diff --git a/docs/voice-agents/voice-sdk.mdx b/docs/voice-agents/voice-sdk.mdx index 8bad8416..913ea762 100644 --- a/docs/voice-agents/voice-sdk.mdx +++ b/docs/voice-agents/voice-sdk.mdx @@ -31,7 +31,6 @@ We use it to build our integrations, and it is also available for you to use. - **Turn detection**: automatically detects when speakers finish talking. - **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. - **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. -- **Intelligent segmentation**: delivers clean, per-speaker, structured segments instead of raw word-level events. ### Segmentation