diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx
index 55f60855..cb05b61d 100644
--- a/docs/voice-agents/overview.mdx
+++ b/docs/voice-agents/overview.mdx
@@ -1,170 +1,57 @@
 ---
-description: Learn how to build voice-enabled applications with the Speechmatics Voice SDK
+description: Learn how to build voice agents with Speechmatics integrations and the Voice SDK.
 ---
 
-import Admonition from '@theme/Admonition';
-import CodeBlock from '@theme/CodeBlock';
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
+import { LinkCard } from "@site/src/theme/LinkCard";
+import { Grid } from "@radix-ui/themes";
 
-import pythonVoiceCustomConfig from "./assets/custom-config.py?raw"
-import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw"
-import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw"
+# Voice agents overview
 
-# Voice SDK overview
-The Voice SDK builds on our Realtime API to provide additional features optimized for conversational AI, using Python:
+Our Voice SDK provides features optimized for conversational AI, which we use to build our integrations.
+Our integration partners are the quickest way to get a production voice agent up and running.
 
-- **Intelligent segmentation**: groups words into meaningful speech segments per speaker.
-- **Turn detection**: automatically detects when speakers finish talking.
-- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios.
-- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions.
-- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events.
+## Features
 
-### Voice SDK vs Realtime SDK
+Speechmatics provides building blocks you can use through integrations and the Voice SDK.
 
-Use the Voice SDK when:
+It includes:
 
-- Building conversational AI or voice agents
-- You need automatic turn detection
-- You want speaker-focused transcription
-- You need ready-to-use presets for common scenarios
+- **Turn detection**: detect when a speaker has finished talking.
+- **Intelligent segmentation**: group partial transcripts into clean, speaker-attributed segments.
+- **Diarization**: identify and label different speakers.
+- **Speaker focus**: focus on or ignore specific speakers in multi-speaker scenarios.
+- **Preset configurations**: start quickly with ready-to-use settings.
+- **Structured events**: work with clean segments instead of raw word-level events.
 
-Use the Realtime SDK when:
+## Integrations
 
-- You need the raw stream of word-by-word transcription data
-- Building custom segmentation logic
-- You want fine-grained control over every event
-- Processing audio files or custom workflows
+Use an integration to handle audio transport and wiring, so you can focus on your agent logic:
 
-## Getting started
+<Grid columns="3" gap="3">
+  <LinkCard
+    title="Vapi"
+    href="/integrations-and-sdks/vapi"
+  />
+  <LinkCard
+    title="LiveKit"
+    href="/integrations-and-sdks/livekit"
+  />
+  <LinkCard
+    title="Pipecat"
+    href="/integrations-and-sdks/pipecat"
+  />
+</Grid>
 
-### 1. Create an API key
+## Voice SDK
 
-[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK.
-Store your key securely as a managed secret.
+Use the Voice SDK to handle turn detection, group transcripts into clean segments, and apply diarization for LLM workflows.
 
-### 2. Install dependencies
+See [Voice SDK](/voice-agents/voice-sdk) for getting started, presets, and configuration.
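+
+As a taste of the API, here is a condensed sketch of the quickstart from that page (it uses the client, preset, and event handler shown there; the API key is read from an environment variable):
+
+```python
+import os
+from speechmatics.voice import VoiceAgentClient, AgentServerMessageType
+
+# Create a client with a ready-made preset and print finalised segments
+client = VoiceAgentClient(api_key=os.getenv("SPEECHMATICS_API_KEY"), preset="scribe")
+
+@client.on(AgentServerMessageType.ADD_SEGMENT)
+def on_segment(message):
+    for segment in message["segments"]:
+        print(f'{segment["speaker_id"]}: {segment["text"]}')
+```
+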
-```bash
-# Standard installation
-pip install speechmatics-voice
-
-# With SMART_TURN (ML-based turn detection)
-pip install speechmatics-voice[smart]
-```
-
-### 3. Quickstart
-
-Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID:
-
-```python
-import asyncio
-import os
-from speechmatics.rt import Microphone
-from speechmatics.voice import VoiceAgentClient, AgentServerMessageType
-
-async def main():
-    """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset"""
-
-    # Audio configuration
-    SAMPLE_RATE = 16000  # Hz
-    CHUNK_SIZE = 160  # Samples per read
-    PRESET = "scribe"  # Configuration preset
-
-    # Create client with preset
-    client = VoiceAgentClient(
-        api_key=os.getenv("SPEECHMATICS_API_KEY"),
-        preset=PRESET
-    )
-
-    # Print finalised segments of speech with speaker ID
-    @client.on(AgentServerMessageType.ADD_SEGMENT)
-    def on_segment(message):
-        for segment in message["segments"]:
-            speaker = segment["speaker_id"]
-            text = segment["text"]
-            print(f"{speaker}: {text}")
-
-    # Setup microphone
-    mic = Microphone(SAMPLE_RATE, CHUNK_SIZE)
-    if not mic.start():
-        print("Error: Microphone not available")
-        return
-
-    # Connect to the Voice Agent
-    await client.connect()
-
-    # Stream microphone audio (interruptable using keyboard)
-    try:
-        while True:
-            audio_chunk = await mic.read(CHUNK_SIZE)
-            if not audio_chunk:
-                break  # Microphone stopped producing data
-            await client.send_audio(audio_chunk)
-    except KeyboardInterrupt:
-        pass
-    finally:
-        await client.disconnect()
-
-if __name__ == "__main__":
-    asyncio.run(main())
-
-```
-
-#### Presets - the simplest way to get started
-These are purpose-built, optimized configurations, ready for use without further modification:
-
-`fast` - low latency, fast responses
-
-`adaptive` - general conversation
-
-`smart_turn` - complex conversation
-
-`external` - user handles end of turn
-
-`scribe` - note-taking
-
-`captions` - live captioning
-
-To view all available presets:
-```python
-presets = VoiceAgentConfigPreset.list_presets()
-```
-
-### 4. Custom configurations
-
-For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays:
-
-Specify configurations in a `VoiceAgentConfig` object:
-  {pythonVoiceCustomConfig}
-
-Use presets as a starting point and customise with overlays:
-  {pythonVoiceConfigOverlays}
-
-Note: If no configuration or preset is provided, the client will default to the `external` preset.
-
-## FAQ
-### Support
-
-Where can I provide feedback or get help?
-
-You can submit feedback, bug reports, or feature requests through the Speechmatics [GitHub discussions](https://github.com/orgs/speechmatics/discussions).
-
-
-## Next steps
-
-- For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on GitHub.
-- For working examples, integrations and templates, check out the [Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy).
-- Share and discuss your project with [our team](https://support.speechmatics.com) or join our [developer community on Reddit](https://www.reddit.com/r/Speechmatics) to connect with other builders in voice AI.
+If you’re building an integration and want to work with us, [contact support](https://support.speechmatics.com).
diff --git a/docs/voice-agents/sidebar.ts b/docs/voice-agents/sidebar.ts
index f14bba42..a622bfff 100644
--- a/docs/voice-agents/sidebar.ts
+++ b/docs/voice-agents/sidebar.ts
@@ -1,5 +1,3 @@
-import voiceAgentsFlowSidebar from "./flow/sidebar";
-
 export default {
   type: "category",
   label: "Voice agents",
@@ -13,9 +11,8 @@ export default {
     },
     {
       type: "doc",
-      id: "voice-agents/features",
-      label: "Features",
+      id: "voice-agents/voice-sdk",
+      label: "Voice SDK",
     },
-    voiceAgentsFlowSidebar,
   ],
 } as const;
\ No newline at end of file
diff --git a/docs/voice-agents/features.mdx b/docs/voice-agents/voice-sdk.mdx
similarity index 52%
rename from docs/voice-agents/features.mdx
rename to docs/voice-agents/voice-sdk.mdx
index 8232cc91..bcef2065 100644
--- a/docs/voice-agents/features.mdx
+++ b/docs/voice-agents/voice-sdk.mdx
@@ -1,9 +1,163 @@
 ---
-description: Learn about configuration parameters for the Voice SDK
+description: Learn how to use the Voice SDK.
 ---
 import CodeBlock from '@theme/CodeBlock';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
 
-# Features
+import pythonVoiceCustomConfig from "./assets/custom-config.py?raw"
+import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw"
+import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw"
+
+# Voice SDK
+
+The Voice SDK is a Python library that provides additional features optimized for conversational AI, built on top of our Realtime API.
+
+We use it to build our integrations, and you can also use it directly in your own applications. It provides:
+
+- **Intelligent segmentation**: groups words into meaningful speech segments per speaker.
+- **Turn detection**: automatically detects when speakers finish talking.
+- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios.
+- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions.
+- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events.
+
+### Voice SDK vs Realtime SDK
+
+Use the Voice SDK when:
+
+- Building conversational AI or voice agents
+- You need automatic turn detection
+- You want speaker-focused transcription
+- You need ready-to-use presets for common scenarios
+
+Use the Realtime SDK when:
+
+- You need the raw stream of word-by-word transcription data
+- Building custom segmentation logic
+- You want fine-grained control over every event
+- Processing audio files or custom workflows
+
+## Getting started
+
+### 1. Create an API key
+
+[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK.
+Store your key securely as a managed secret.
+
+### 2. Install dependencies
+
+```bash
+# Standard installation
+pip install speechmatics-voice
+
+# With SMART_TURN (ML-based turn detection)
+pip install speechmatics-voice[smart]
+```
+
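+Before running the quickstart in the next step, make sure your API key is available to your program. A minimal sanity check, assuming you expose the key through the `SPEECHMATICS_API_KEY` environment variable that the quickstart reads:
+
+```python
+import os
+
+# The quickstart below reads the key from this environment variable
+if not os.getenv("SPEECHMATICS_API_KEY"):
+    raise RuntimeError("Set SPEECHMATICS_API_KEY before running the quickstart")
+```
+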
+### 3. Quickstart
+
+Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID:
+
+```python
+import asyncio
+import os
+from speechmatics.rt import Microphone
+from speechmatics.voice import VoiceAgentClient, AgentServerMessageType
+
+async def main():
+    """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset"""
+
+    # Audio configuration
+    SAMPLE_RATE = 16000  # Hz
+    CHUNK_SIZE = 160  # Samples per read
+    PRESET = "scribe"  # Configuration preset
+
+    # Create client with preset
+    client = VoiceAgentClient(
+        api_key=os.getenv("SPEECHMATICS_API_KEY"),
+        preset=PRESET
+    )
+
+    # Print finalised segments of speech with speaker ID
+    @client.on(AgentServerMessageType.ADD_SEGMENT)
+    def on_segment(message):
+        for segment in message["segments"]:
+            speaker = segment["speaker_id"]
+            text = segment["text"]
+            print(f"{speaker}: {text}")
+
+    # Setup microphone
+    mic = Microphone(SAMPLE_RATE, CHUNK_SIZE)
+    if not mic.start():
+        print("Error: Microphone not available")
+        return
+
+    # Connect to the Voice Agent
+    await client.connect()
+
+    # Stream microphone audio (interruptible from the keyboard)
+    try:
+        while True:
+            audio_chunk = await mic.read(CHUNK_SIZE)
+            if not audio_chunk:
+                break  # Microphone stopped producing data
+            await client.send_audio(audio_chunk)
+    except KeyboardInterrupt:
+        pass
+    finally:
+        await client.disconnect()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+#### Presets - the simplest way to get started
+
+These are purpose-built, optimized configurations, ready for use without further modification:
+
+- `fast` - low latency, fast responses
+- `adaptive` - general conversation
+- `smart_turn` - complex conversation
+- `external` - user handles end of turn
+- `scribe` - note-taking
+- `captions` - live captioning
+
+To view all available presets:
+
+```python
+presets = VoiceAgentConfigPreset.list_presets()
+```
+
+### 4. Custom configurations
+
+For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays:
+
+<Tabs>
+<TabItem value="custom" label="Custom configuration">
+
+Specify configurations in a `VoiceAgentConfig` object:
+
+<CodeBlock language="python">{pythonVoiceCustomConfig}</CodeBlock>
+
+</TabItem>
+<TabItem value="overlays" label="Preset with overlays">
+
+Use presets as a starting point and customise with overlays:
+
+<CodeBlock language="python">{pythonVoiceConfigOverlays}</CodeBlock>
+
+</TabItem>
+</Tabs>
+
+Note: If no configuration or preset is provided, the client will default to the `external` preset.
+
+## Configuration
 
 ### Basic parameters
 `language` (str, default: "en")
@@ -45,7 +199,8 @@ Silence duration in seconds to trigger turn end.
 Maximum delay before forcing turn end.
 
 `max_delay` (float, default: 0.7)
-Maximum transcription delay for word emission.
+Maximum transcription delay for word emission.
+Defaults to 0.7 seconds, but when using turn detection we recommend 1.0 seconds for better accuracy; turn detection ensures finalisation latency is not affected.
 
 ### Speaker configuration
 `speaker_sensitivity` (float, default: 0.5)
diff --git a/sidebars.ts b/sidebars.ts
index 61f6511b..2f7522b6 100644
--- a/sidebars.ts
+++ b/sidebars.ts
@@ -10,8 +10,8 @@ export default {
   docs: [
     gettingStartedSidebar,
     speechToTextSidebar,
-    voiceAgentsSidebar,
     textToSpeechSidebar,
+    voiceAgentsSidebar,
     integrationsAndSDKSidebar,
     deploymentsSidebar,
     {