diff --git a/docs/voice-agents/assets/additional-vocab.py b/docs/voice-agents/assets/additional-vocab.py
new file mode 100644
index 0000000..fa221e1
--- /dev/null
+++ b/docs/voice-agents/assets/additional-vocab.py
@@ -0,0 +1,12 @@
+from speechmatics.voice import AdditionalVocabEntry, VoiceAgentConfig
+
+config = VoiceAgentConfig(
+ language="en",
+ additional_vocab=[
+ AdditionalVocabEntry(
+ content="Speechmatics",
+ sounds_like=["speech matters", "speech matics"]
+ ),
+ AdditionalVocabEntry(content="API"),
+ ]
+)
diff --git a/docs/voice-agents/assets/advanced-config.py b/docs/voice-agents/assets/advanced-config.py
new file mode 100644
index 0000000..64ee1e4
--- /dev/null
+++ b/docs/voice-agents/assets/advanced-config.py
@@ -0,0 +1,22 @@
+from speechmatics.voice import (
+ EndOfUtteranceMode,
+ SpeakerFocusConfig,
+ SpeakerFocusMode,
+ SpeakerIdentifier,
+ VoiceAgentConfig,
+ VoiceAgentConfigPreset,
+)
+
+overrides = VoiceAgentConfig(
+ end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
+ enable_diarization=True,
+ speaker_config=SpeakerFocusConfig(
+ focus_speakers=["S1"],
+ focus_mode=SpeakerFocusMode.RETAIN,
+ ),
+ known_speakers=[
+ SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]),
+ ],
+)
+
+config = VoiceAgentConfigPreset.ADAPTIVE(overrides)
diff --git a/docs/voice-agents/assets/basic-config.py b/docs/voice-agents/assets/basic-config.py
new file mode 100644
index 0000000..b6b5cfe
--- /dev/null
+++ b/docs/voice-agents/assets/basic-config.py
@@ -0,0 +1,36 @@
+from speechmatics.voice import (
+ AdditionalVocabEntry,
+ AudioEncoding,
+ OperatingPoint,
+ VoiceAgentConfig,
+ VoiceAgentConfigPreset,
+)
+
+overrides = VoiceAgentConfig(
+ # Language and locale
+ language="en", # e.g. "en", "es", "fr"
+ output_locale=None, # e.g. "en-GB", "en-US"
+
+ # Model selection
+ operating_point=OperatingPoint.ENHANCED, # STANDARD or ENHANCED
+ domain=None, # e.g. "finance", "medical"
+
+ # Vocabulary
+ additional_vocab=[
+ AdditionalVocabEntry(
+ content="Speechmatics",
+ sounds_like=["speech matters", "speech matics"],
+ ),
+ AdditionalVocabEntry(content="API"),
+ ],
+ punctuation_overrides=None,
+
+ # Audio
+ sample_rate=16000,
+ audio_encoding=AudioEncoding.PCM_S16LE,
+
+ # Diarization
+ enable_diarization=True,
+)
+
+config = VoiceAgentConfigPreset.ADAPTIVE(overrides)
diff --git a/docs/voice-agents/assets/custom-config.py b/docs/voice-agents/assets/custom-config.py
index b55e885..be79893 100644
--- a/docs/voice-agents/assets/custom-config.py
+++ b/docs/voice-agents/assets/custom-config.py
@@ -1,3 +1,4 @@
+import os
from speechmatics.voice import VoiceAgentClient, VoiceAgentConfig, EndOfUtteranceMode
config = VoiceAgentConfig(
diff --git a/docs/voice-agents/assets/event-subscription.py b/docs/voice-agents/assets/event-subscription.py
new file mode 100644
index 0000000..6f5ad67
--- /dev/null
+++ b/docs/voice-agents/assets/event-subscription.py
@@ -0,0 +1,9 @@
+@client.on(AgentServerMessageType.ADD_SEGMENT)
+def on_final_segment(message):
+ for segment in message["segments"]:
+ print(f"[FINAL] {segment['speaker_id']}: {segment['text']}")
+
+@client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT)
+def on_partial_segment(message):
+ for segment in message["segments"]:
+ print(f"[PARTIAL] {segment['speaker_id']}: {segment['text']}")
diff --git a/docs/voice-agents/assets/known-speakers.py b/docs/voice-agents/assets/known-speakers.py
new file mode 100644
index 0000000..1a03eff
--- /dev/null
+++ b/docs/voice-agents/assets/known-speakers.py
@@ -0,0 +1,9 @@
+from speechmatics.voice import SpeakerIdentifier, VoiceAgentConfig
+
+config = VoiceAgentConfig(
+ enable_diarization=True,
+ known_speakers=[
+ SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]),
+ SpeakerIdentifier(label="Bob", speaker_identifiers=["YY...YY"])
+ ]
+)
diff --git a/docs/voice-agents/assets/quickstart.py b/docs/voice-agents/assets/quickstart.py
new file mode 100644
index 0000000..5b8aabb
--- /dev/null
+++ b/docs/voice-agents/assets/quickstart.py
@@ -0,0 +1,50 @@
+import asyncio
+import os
+from speechmatics.rt import Microphone
+from speechmatics.voice import VoiceAgentClient, AgentServerMessageType
+
+async def main():
+    """Stream microphone audio to Speechmatics Voice Agent using the 'scribe' preset."""
+
+ # Audio configuration
+ SAMPLE_RATE = 16000 # Hz
+ CHUNK_SIZE = 160 # Samples per read
+ PRESET = "scribe" # Configuration preset
+
+ # Create client with preset
+ client = VoiceAgentClient(
+ api_key=os.getenv("SPEECHMATICS_API_KEY"),
+ preset=PRESET
+ )
+
+    # Print finalized segments of speech with speaker ID
+ @client.on(AgentServerMessageType.ADD_SEGMENT)
+ def on_segment(message):
+ for segment in message["segments"]:
+ speaker = segment["speaker_id"]
+ text = segment["text"]
+ print(f"{speaker}: {text}")
+
+ # Setup microphone
+ mic = Microphone(SAMPLE_RATE, CHUNK_SIZE)
+ if not mic.start():
+ print("Error: Microphone not available")
+ return
+
+ # Connect to the Voice Agent
+ await client.connect()
+
+ # Stream microphone audio (interruptible using keyboard)
+ try:
+ while True:
+ audio_chunk = await mic.read(CHUNK_SIZE)
+ if not audio_chunk:
+ break # Microphone stopped producing data
+ await client.send_audio(audio_chunk)
+ except KeyboardInterrupt:
+ pass
+ finally:
+ await client.disconnect()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/voice-agents/assets/smart-turn.py b/docs/voice-agents/assets/smart-turn.py
new file mode 100644
index 0000000..d887257
--- /dev/null
+++ b/docs/voice-agents/assets/smart-turn.py
@@ -0,0 +1,15 @@
+from speechmatics.voice import (
+ EndOfUtteranceMode,
+ SmartTurnConfig,
+ VoiceAgentConfig,
+ VoiceAgentConfigPreset,
+)
+
+# ADAPTIVE mode + ML-enhanced turn detection
+config = VoiceAgentConfig(
+ end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
+ smart_turn_config=SmartTurnConfig(enabled=True),
+)
+
+# Or use the SMART_TURN preset which bundles this configuration
+config = VoiceAgentConfigPreset.SMART_TURN()
diff --git a/docs/voice-agents/assets/speaker-focus-handler.py b/docs/voice-agents/assets/speaker-focus-handler.py
new file mode 100644
index 0000000..a0375fc
--- /dev/null
+++ b/docs/voice-agents/assets/speaker-focus-handler.py
@@ -0,0 +1,7 @@
+@client.on(AgentServerMessageType.ADD_SEGMENT)
+def on_segment(message):
+ for segment in message["segments"]:
+ if segment["is_active"]:
+ process_focused_speaker(segment["text"])
+ else:
+ process_passive_speaker(segment["speaker_id"], segment["text"])
diff --git a/docs/voice-agents/assets/speaker-focus.py b/docs/voice-agents/assets/speaker-focus.py
new file mode 100644
index 0000000..1c2dcc7
--- /dev/null
+++ b/docs/voice-agents/assets/speaker-focus.py
@@ -0,0 +1,27 @@
+from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode, VoiceAgentConfig
+
+# Focus on specific speakers, keep others as passive
+config = VoiceAgentConfig(
+ enable_diarization=True,
+ speaker_config=SpeakerFocusConfig(
+ focus_speakers=["S1", "S2"],
+ focus_mode=SpeakerFocusMode.RETAIN
+ )
+)
+
+# Focus on specific speakers, exclude everyone else
+config = VoiceAgentConfig(
+ enable_diarization=True,
+ speaker_config=SpeakerFocusConfig(
+ focus_speakers=["S1", "S2"],
+ focus_mode=SpeakerFocusMode.IGNORE
+ )
+)
+
+# Blacklist specific speakers (exclude them from all processing)
+config = VoiceAgentConfig(
+ enable_diarization=True,
+ speaker_config=SpeakerFocusConfig(
+ ignore_speakers=["S3"],
+ )
+)
diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx
index cb05b61..788d760 100644
--- a/docs/voice-agents/overview.mdx
+++ b/docs/voice-agents/overview.mdx
@@ -6,21 +6,23 @@ import { Grid } from "@radix-ui/themes";
# Voice agents overview
-Our Voice SDK provides features optimized for conversational AI, which we use to build our integrations.
-Our integration partners are the quickest way to get a production voice agent up and running,
+There are two ways to build voice agents using Speechmatics:
+
+- Integration partners (LiveKit, Pipecat and VAPI): the fastest path to a production voice agent.
+- Voice SDK: direct access for custom pipelines or working outside of supported integration platforms.
+
## Features
Speechmatics provides building blocks you can use through integrations and the Voice SDK.
-It includes:
+Key features include:
- **Turn detection**: detect when a speaker has finished talking.
- **Intelligent segmentation**: group partial transcripts into clean, speaker-attributed segments.
- **Diarization**: identify and label different speakers.
- **Speaker focus**: focus on or ignore specific speakers in multi-speaker scenarios.
- **Preset configurations**: start quickly with ready-to-use settings.
-- **Structured events**: work with clean segments instead of raw word-level events.
## Integrations
@@ -51,7 +53,4 @@ Use an integration to handle audio transport and wiring, so you can focus on you
Use the Voice SDK to handle turn detection, group transcripts into clean segments, and apply diarization for LLM workflows.
-See [Voice SDK](/voice-agents/voice-sdk) for getting started, presets, and configuration.
-
-If you’re building an integration and want to work with us, [contact support](https://support.speechmatics.com).
-
+See [Voice SDK](/voice-agents/voice-sdk) for information on getting started, presets, and configuration.
diff --git a/docs/voice-agents/voice-sdk.mdx b/docs/voice-agents/voice-sdk.mdx
index bcef206..913ea76 100644
--- a/docs/voice-agents/voice-sdk.mdx
+++ b/docs/voice-agents/voice-sdk.mdx
@@ -1,6 +1,7 @@
---
description: Learn how to use the Voice SDK.
---
+
import CodeBlock from '@theme/CodeBlock';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
@@ -8,11 +9,20 @@ import TabItem from '@theme/TabItem';
import pythonVoiceCustomConfig from "./assets/custom-config.py?raw"
import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw"
import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw"
-
-
+import pythonQuickstart from "./assets/quickstart.py?raw"
+import pythonEventSubscription from "./assets/event-subscription.py?raw"
+import pythonAdditionalVocab from "./assets/additional-vocab.py?raw"
+import pythonBasicConfig from "./assets/basic-config.py?raw"
+import pythonSmartTurn from "./assets/smart-turn.py?raw"
+import pythonSpeakerFocus from "./assets/speaker-focus.py?raw"
+import pythonSpeakerFocusHandler from "./assets/speaker-focus-handler.py?raw"
+import pythonKnownSpeakers from "./assets/known-speakers.py?raw"
+import pythonAdvancedConfig from "./assets/advanced-config.py?raw"
# Voice SDK
+## Overview
+
The Voice SDK is a Python library that provides additional features optimized for conversational AI, built on top of our Realtime API.
We use it to build our integrations, and it is also available for you to use.
@@ -21,32 +31,54 @@ We use it to build our integrations, and it is also available for you to use.
- **Turn detection**: automatically detects when speakers finish talking.
- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios.
- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions.
-- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events.
-### Voice SDK vs Realtime SDK
+### Segmentation
+
+Segmentation groups words into per-speaker, readable chunks of text.
+In practice, this means you can work with finalized segments rather than stitching together word-by-word updates.
+
+### Turn detection and finalization
+
+Turn detection determines when a speaker has finished a turn.
+When a turn is detected, speech is finalized into segments that you can use in your application.
+
+Turn detection (and subsequent finalization) is important for speed: the sooner a turn is finalized, the sooner you can send a final transcript to an LLM.
-Use the Voice SDK when:
+We take the complexity out of this through presets.
+If you prefer manual control, use the `external` preset and call `client.finalize()` to end a turn.
+This sends a signal to the Speechmatics servers to finalize the current speech immediately.
-- Building conversational AI or voice agents
-- You need automatic turn detection
-- You want speaker-focused transcription
-- You need ready-to-use presets for common scenarios
+### Diarization and speaker management
-Use the Realtime SDK when:
+When diarization is enabled, the Voice SDK assigns speaker IDs (for example `S1`, `S2`) and produces segments per speaker.
-- You need the raw stream of word-by-word transcription data
-- Building custom segmentation logic
-- You want fine-grained control over every event
-- Processing audio files or custom workflows
+You can also:
-## Getting started
+- Focus on specific speakers
+- Ignore specific speakers
+- Provide known speakers for speaker identification
-### 1. Create an API key
+### Voice SDK vs Realtime SDK
+
+- Use the Voice SDK when:
+ - Building conversational AI or voice agents
+ - You need automatic turn detection
+ - You want speaker-focused transcription
+ - You need ready-to-use presets for common scenarios
+- Use the Realtime SDK when:
+ - You need the raw stream of word-by-word transcription data
+ - Building custom segmentation logic
+ - You want fine-grained control over every event
+ - Processing audio files or custom workflows
+
+## Get started
+
+### Create an API key
[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK.
Store your key securely as a managed secret.
-### 2. Install dependencies
+### Install
```bash
# Standard installation
@@ -56,79 +88,107 @@ pip install speechmatics-voice
pip install speechmatics-voice[smart]
```
-### 3. Quickstart
+### Quickstart
-Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID:
+Here's how to stream microphone audio to the Voice Agent and transcribe finalized segments of speech, with speaker ID:
-```python
-import asyncio
-import os
-from speechmatics.rt import Microphone
-from speechmatics.voice import VoiceAgentClient, AgentServerMessageType
-
-async def main():
- """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset"""
-
- # Audio configuration
- SAMPLE_RATE = 16000 # Hz
- CHUNK_SIZE = 160 # Samples per read
- PRESET = "scribe" # Configuration preset
-
- # Create client with preset
- client = VoiceAgentClient(
- api_key=os.getenv("SPEECHMATICS_API_KEY"),
- preset=PRESET
- )
-
- # Print finalised segments of speech with speaker ID
- @client.on(AgentServerMessageType.ADD_SEGMENT)
- def on_segment(message):
- for segment in message["segments"]:
- speaker = segment["speaker_id"]
- text = segment["text"]
- print(f"{speaker}: {text}")
-
- # Setup microphone
- mic = Microphone(SAMPLE_RATE, CHUNK_SIZE)
- if not mic.start():
- print("Error: Microphone not available")
- return
-
- # Connect to the Voice Agent
- await client.connect()
-
- # Stream microphone audio (interruptable using keyboard)
- try:
- while True:
- audio_chunk = await mic.read(CHUNK_SIZE)
- if not audio_chunk:
- break # Microphone stopped producing data
- await client.send_audio(audio_chunk)
- except KeyboardInterrupt:
- pass
- finally:
- await client.disconnect()
-
-if __name__ == "__main__":
- asyncio.run(main())
+
+ {pythonQuickstart}
+
+
+Note: `Microphone` is imported from the Realtime SDK (`speechmatics.rt`). Install with `pip install speechmatics-rt`.
+
+## Events and segments
+The Voice SDK emits events as transcription progresses. The two main segment events are:
+
+- `ADD_PARTIAL_SEGMENT` - Interim results that stream in real-time as speech is recognized
+- `ADD_SEGMENT` - Final results emitted when a turn ends
+
+### How segments work
+
+As someone speaks, you receive `ADD_PARTIAL_SEGMENT` events with the current transcription. These update continuously—each new partial replaces the previous one.
+
+When a turn is detected (or you call `client.finalize()`), the SDK emits an `ADD_SEGMENT` event with the finalized transcript. This is the stable result you should use for downstream processing like sending to an LLM.
+
+```
+Speaking: "Hello, how are you?"
+
+Timeline:
+ ADD_PARTIAL_SEGMENT: "Hello"
+ ADD_PARTIAL_SEGMENT: "Hello, how"
+ ADD_PARTIAL_SEGMENT: "Hello, how are"
+ ADD_PARTIAL_SEGMENT: "Hello, how are you"
+ (turn detected or finalize() called)
+ ADD_SEGMENT: "Hello, how are you?" ← Use this
+```
+
+### Segment payload
+
+Example `ADD_SEGMENT` payload:
+
+```json
+{
+ "message": "AddSegment",
+ "segments": [
+ {
+ "speaker_id": "S1",
+ "is_active": true,
+ "timestamp": "2025-11-11T23:18:37.189+00:00",
+ "language": "en",
+ "text": "Welcome to Speechmatics.",
+ "metadata": {
+ "start_time": 1.28,
+ "end_time": 8.04
+ }
+ }
+ ],
+ "metadata": {
+ "start_time": 1.28,
+ "end_time": 8.04,
+ "processing_time": 0.187
+ }
+}
```
-#### Presets - the simplest way to get started
+**Field explanations:**
+- `speaker_id`: Speaker label (e.g., `S1`, `S2`, or custom label if using known speakers)
+- `is_active`: Whether this speaker is in your focus list (see [Speaker focus](#speaker-focus))
+- `timestamp`: Absolute wall-clock time (ISO 8601 format)
+- `start_time` / `end_time`: Time in seconds relative to the start of the session
+- `processing_time`: Transcription latency in seconds
+
+### Subscribing to events
+
+
+ {pythonEventSubscription}
+
+
+### When finals are emitted
+
+Final segments (`ADD_SEGMENT`) are emitted when:
+1. **Turn detection** triggers automatically (based on your preset/config)
+2. **You call `client.finalize()`** manually (when using `external` preset)
+
+See [Turn detection](#turn-detection) for more on automatic finalization.
+
+## Presets
These are purpose-built, optimized configurations, ready for use without further modification:
-`fast` - low latency, fast responses
+`FAST` - low latency, fast responses
+
+`FIXED` - general conversation with fixed, silence-based turn detection
-`adaptive` - general conversation
+`ADAPTIVE` - general conversation with adaptive timing
-`smart_turn` - complex conversation
+`SMART_TURN` - complex conversation with ML-enhanced turn detection
-`external` - user handles end of turn
+`EXTERNAL` - user handles end of turn
-`scribe` - note-taking
+`SCRIBE` - note-taking
-`captions` - live captioning
+`CAPTIONS` - live captioning
To view all available presets:
@@ -136,7 +196,10 @@ To view all available presets:
presets = VoiceAgentConfigPreset.list_presets()
```
-### 4. Custom configurations
+Presets include defaults for all settings (language defaults to English).
+To change the language (or any other preset setting), use a custom configuration or use a preset as a starting point and customize with overlays.
+
+### Custom configuration
For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays:
@@ -157,52 +220,141 @@ Use presets as a starting point and customise with overlays:
Note: If no configuration or preset is provided, the client will default to the `external` preset.
-## Configuration
+## Basic configuration
+
+### Language and locale
-### Basic parameters
`language` (str, default: "en")
Language code for transcription (e.g., "en", "es", "fr").
See [supported languages](/speech-to-text/languages).
+`output_locale` (str, default: None)
+Output locale for formatting (e.g., "en-GB", "en-US").
+See [supported languages and locales](/speech-to-text/languages).
+
+### Model selection
+
`operating_point` (OperatingPoint, default: ENHANCED)
-Balance accuracy vs latency.
+Select an accuracy level.
Options: STANDARD or ENHANCED.
`domain` (str, default: None)
Domain-specific model (e.g., "finance", "medical").
See [supported languages and domains](/speech-to-text/languages).
-`output_locale` (str, default: None)
-Output locale for formatting (e.g., "en-GB", "en-US").
-See [supported languages and locales](/speech-to-text/languages).
+### Vocabulary
+
+`additional_vocab` (list[AdditionalVocabEntry], default: [])
+
+Custom vocabulary for domain-specific terms.
+
+
+ {pythonAdditionalVocab}
+
+
+`punctuation_overrides` (dict, default: None)
+Custom punctuation rules. Keys are punctuation marks, values are replacement strings.
+
+### Audio
+
+`sample_rate` (int, default: 16000)
+Audio sample rate in Hz.
+
+`audio_encoding` (AudioEncoding, default: PCM_S16LE)
+Audio encoding format.
+
+### Latency and quality
+
+`max_delay` (float, default: 1.0)
+Maximum transcription delay in seconds for word emission.
+Turn detection ensures finalization latency is not affected.
+
+### Basic diarization
`enable_diarization` (bool, default: False)
Enable speaker diarization to identify and label different speakers.
+When enabled, segments include a `speaker_id` field (for example `S1`, `S2`).
+
+### Basic configuration example
+
+
+ {pythonBasicConfig}
+
+
+## Advanced configuration
### Turn detection
-`end_of_utterance_mode` (EndOfUtteranceMode, default: FIXED)
-Controls how turn endings are detected:
-
-- `FIXED`: Uses fixed silence threshold.
-Fast but may split slow speech.
-- `ADAPTIVE`: Adjusts delay based on speech rate, pauses, and disfluencies.
-Best for natural conversation.
-- `SMART_TURN`: Uses ML model to detect acoustic turn-taking cues.
-Requires [smart] extras.
-- `EXTERNAL`: Manual control via client.finalize().
-For custom turn logic.
-
-`end_of_utterance_silence_trigger` (float, default: 0.2)
+
+Presets configure turn detection under the hood.
+When a turn is detected (or you call `client.finalize()` using the `external` preset), we send a signal to our servers so you can get the final transcript back as quickly as possible.
+
+This works in multi-speaker scenarios, including when diarization is enabled.
+
+`end_of_utterance_mode` (EndOfUtteranceMode, default: FIXED)
+Controls the base strategy for detecting turn endings:
+
+- `FIXED`: Uses a fixed silence threshold. Fast but may split slow speech.
+- `ADAPTIVE`: Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation.
+- `EXTERNAL`: Manual control via `client.finalize()`. For custom turn logic.
+
+`end_of_utterance_silence_trigger` (float, default: 0.5)
Silence duration in seconds to trigger turn end.
-`end_of_utterance_max_delay` (float, default: 10.0)
+`end_of_utterance_max_delay` (float, default: 10.0)
Maximum delay before forcing turn end.
-`max_delay` (float, default: 0.7)
-Maximum transcription delay for word emission.
-Defaults to 0.7 seconds, but when using turn detection we recommend 1.0s for better accuracy. Turn detection will ensure finalisation latency is not affected.
+#### Voice activity detection
+
+`vad_config` (VoiceActivityConfig, default: None)
+Configure voice activity detection:
+
+- `enabled` (bool, default: False) - Enable VAD.
+- `silence_duration` (float, default: 0.18) - Seconds of silence before considering speech ended.
+- `threshold` (float, default: 0.35) - Sensitivity threshold for detecting speech.
+
+#### Smart turn (ML-enhanced detection)
+
+`smart_turn_config` (SmartTurnConfig, default: None)
+Enables an ML model that detects acoustic turn-taking cues (intonation, rhythm patterns) on top of the base mode.
+
+Smart turn can be combined with `FIXED` or `ADAPTIVE` modes, but **not** with `EXTERNAL` mode.
+
+
+ {pythonSmartTurn}
+
+
+Requires the `[smart]` extras: `pip install speechmatics-voice[smart]`
+
+### Segment output options
+
+`include_partials` (bool, default: True)
+Emit partial segments via `ADD_PARTIAL_SEGMENT`.
+Set to `False` for final-only output.
+
+`include_results` (bool, default: False)
+Include word-level timing data in segments.
+
+`transcription_update_preset` (TranscriptionUpdatePreset, default: COMPLETE)
+Controls when partial segment updates are emitted.
+Options: `COMPLETE`, `COMPLETE_PLUS_TIMING`, `WORDS`, `WORDS_PLUS_TIMING`, `TIMING`.
+
+### Segment generation options
+
+`speech_segment_config` (SpeechSegmentConfig, default: SpeechSegmentConfig())
+Fine-tune segment generation and post-processing:
+
+- `add_trailing_eos` (bool, default: False) - Append end-of-sentence markers to segments that are missing them.
+- `emit_sentences` (bool, default: True) - Emit a finalized segment as soon as a sentence ends. If a speaker continues during a turn, multiple segments may be emitted.
+- `pause_mark` (Optional[str], default: None) - Insert a custom string when pauses are detected (e.g., `"..."` produces `"Hello ... how are you?"`).
+
+### Advanced diarization
+
+#### Sensitivity and speaker limits
+
+`enable_diarization` (bool, default: False)
+Enable speaker diarization to identify and label different speakers.
+You must set this to `True` to use any of the diarization options below.
-### Speaker configuration
`speaker_sensitivity` (float, default: 0.5)
Diarization sensitivity between 0.0 and 1.0.
Higher values detect more speakers.
@@ -210,141 +362,66 @@ Higher values detect more speakers.
`max_speakers` (int, default: None)
Limit maximum number of speakers to detect.
+#### Speaker grouping
+
`prefer_current_speaker` (bool, default: False)
Give extra weight to current speaker for word grouping.
-`speaker_config` (SpeakerFocusConfig, default: SpeakerFocusConfig())
+#### Speaker focus
+
+`speaker_config` (SpeakerFocusConfig, default: SpeakerFocusConfig())
Configure speaker focus/ignore rules.
-
-{
-`from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode
-
-# Focus only on specific speakers
-config = VoiceAgentConfig(
- enable_diarization=True,
- speaker_config=SpeakerFocusConfig(
- focus_speakers=["S1", "S2"],
- focus_mode=SpeakerFocusMode.RETAIN
- )
-)
-
-# Ignore specific speakers
-config = VoiceAgentConfig(
- enable_diarization=True,
- speaker_config=SpeakerFocusConfig(
- ignore_speakers=["S3"],
- focus_mode=SpeakerFocusMode.IGNORE
- )
-)`
-}
-
-`known_speakers` (list[SpeakerIdentifier], default: [])
-Pre-enrolled speaker identifiers for speaker identification.
+When diarization is enabled, you can control which speakers appear in your output and how they are treated.
-
-{
-`from speechmatics.voice import SpeakerIdentifier
-
-config = VoiceAgentConfig(
- enable_diarization=True,
- known_speakers=[
- SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]),
- SpeakerIdentifier(label="Bob", speaker_identifiers=["YY...YY"])
- ]
-)`
-}
-
-### Language and vocabulary
-`additional_vocab` (list[AdditionalVocabEntry], default: [])
+When no `focus_speakers` are configured, all detected speakers are treated as active (`is_active: true`).
-Custom vocabulary for domain-specific terms.
+**Active speakers** are speakers in your `focus_speakers` list.
+Their segments have `is_active: true`.
-
-{
-`from speechmatics.voice import AdditionalVocabEntry
-
-config = VoiceAgentConfig(
- language="en",
- additional_vocab=[
- AdditionalVocabEntry(
- content="Speechmatics",
- sounds_like=["speech matters", "speech matics"]
- ),
- AdditionalVocabEntry(content="API"),
- ]
-)`
-}
-
-`punctuation_overrides` (dict, default: None)
-Custom punctuation rules.
+**Passive speakers** are speakers not in `focus_speakers` but still included in output when using `SpeakerFocusMode.RETAIN`.
+Their segments have `is_active: false`.
-### Audio parameters
-`sample_rate` (int, default: 16000)
-Audio sample rate in Hz.
+**Ignored speakers** are completely excluded from output.
+Their speech does not appear in segments and does not trigger turn detection.
-`audio_encoding` (AudioEncoding, default: PCM_S16LE)
-Audio encoding format.
+**SpeakerFocusMode options:**
-### Advanced parameters
-`transcription_update_preset` (TranscriptionUpdatePreset, default: COMPLETE)
-Controls when to emit updates: COMPLETE, COMPLETE_PLUS_TIMING, WORDS, WORDS_PLUS_TIMING, or TIMING.
+- `RETAIN`: Non-focused speakers are kept in output as passive speakers (`is_active: false`). Use this when you want to prioritize certain speakers but still see what others say.
+- `IGNORE`: Non-focused speakers are excluded entirely from output. Use this when you only care about specific speakers and want to filter out everyone else.
-`speech_segment_config` (SpeechSegmentConfig, default: SpeechSegmentConfig())
-Fine-tune segment generation and post-processing.
+
+ {pythonSpeakerFocus}
+
-`smart_turn_config` (SmartTurnConfig, default: None)
-Configure SMART_TURN behavior (buffer length, threshold).
+In your event handler, you can use `is_active` to decide how to route segments:
-`include_results` (bool, default: False)
-Include word-level timing data in segments.
+
+ {pythonSpeakerFocusHandler}
+
-`include_partials` (bool, default: True)
-Emit partial segments. Set to False for final-only output.
+#### Known speakers (speaker identification)
-### Configuration with overlays.
-Use presets as a starting point and customize with overlays:
+`known_speakers` (list[SpeakerIdentifier], default: [])
+Pre-enrolled speaker identifiers for speaker identification.
-{
-`from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig
-
-# Use preset with custom overrides
-config = VoiceAgentConfigPreset.SCRIBE(
- VoiceAgentConfig(
- language="es",
- max_delay=0.8
- )
-)`
-}
+ {pythonKnownSpeakers}
-### Available presets
+### Advanced configuration example
+
-{
-`presets = VoiceAgentConfigPreset.list_presets()
-# Output: ['low_latency', 'conversation_adaptive', 'conversation_smart_turn', 'scribe', 'captions']`
-}
+ {pythonAdvancedConfig}
-### Configuration serialization
+## Import and export configurations
Export and import configurations as JSON:
-{
-`from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig
-
-# Export preset to JSON
-config_json = VoiceAgentConfigPreset.SCRIBE().to_json()
-
-# Load from JSON
-config = VoiceAgentConfig.from_json(config_json)
-
-# Or create from JSON string
-config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}')`
-}
+ {pythonVoiceConfigSerialization}
+## More information
-For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on github.
-`
+- [Voice SDK on GitHub](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice)