diff --git a/DeepL/DeepL.csproj b/DeepL/DeepL.csproj
index f6319aa..c6c8400 100644
--- a/DeepL/DeepL.csproj
+++ b/DeepL/DeepL.csproj
@@ -34,6 +34,7 @@
+
diff --git a/DeepL/DeepLClient.cs b/DeepL/DeepLClient.cs
index 6a2dc91..73cb4d4 100644
--- a/DeepL/DeepLClient.cs
+++ b/DeepL/DeepLClient.cs
@@ -6,6 +6,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
+using System.Net.WebSockets;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading;
@@ -54,7 +55,7 @@ Task RephraseTextAsync(
/// Client for the DeepL API. To use the DeepL API, initialize an instance of this class using your DeepL
/// Authentication Key. All functions are thread-safe, aside from .
///
- public sealed class DeepLClient : Translator, IWriter, IGlossaryManager, IStyleRuleManager {
+ public sealed class DeepLClient : Translator, IWriter, IGlossaryManager, IStyleRuleManager, IVoiceManager {
/// Initializes a new instance of the class.
/// The message that describes the error.
public DeepLClient(string authKey, DeepLClientOptions? options = null) : base(authKey, options) { }
@@ -939,6 +940,79 @@ private static (string Key, string Value)[] CreateLanguageQueryParams(
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
};
+ ///
+ public async Task<IVoiceSession> CreateVoiceSessionAsync(
+ VoiceSessionOptions options,
+ CancellationToken cancellationToken = default) {
+ if (options == null) {
+ throw new ArgumentNullException(nameof(options));
+ }
+
+ if (options.TargetLanguages == null || options.TargetLanguages.Length == 0) {
+ throw new ArgumentException("At least one target language must be specified");
+ }
+
+ if (options.TargetLanguages.Length > 5) {
+ throw new ArgumentException("Maximum 5 target languages per session");
+ }
+
+ var requestData = new Dictionary<string, object> {
+ ["source_media_content_type"] = options.SourceMediaContentType,
+ ["target_languages"] = options.TargetLanguages
+ };
+
+ if (options.MessageFormat != null) {
+ requestData["message_format"] = options.MessageFormat.Value.ToApiValue();
+ }
+
+ if (options.SourceLanguage != null) {
+ requestData["source_language"] = options.SourceLanguage;
+ }
+
+ if (options.SourceLanguageMode != null) {
+ requestData["source_language_mode"] = options.SourceLanguageMode.Value.ToApiValue();
+ }
+
+ if (options.TargetMediaLanguages != null) {
+ requestData["target_media_languages"] = options.TargetMediaLanguages;
+ }
+
+ if (options.TargetMediaContentType != null) {
+ requestData["target_media_content_type"] = options.TargetMediaContentType;
+ }
+
+ if (options.TargetMediaVoice != null) {
+ requestData["target_media_voice"] = options.TargetMediaVoice.Value.ToApiValue();
+ }
+
+ if (options.GlossaryId != null) {
+ requestData["glossary_id"] = options.GlossaryId;
+ }
+
+ if (options.Formality != null) {
+ requestData["formality"] = options.Formality;
+ }
+
+ using var responseMessage = await _client
+ .ApiPostJsonAsync("v3/voice/realtime", cancellationToken, requestData, SerializationOptions)
+ .ConfigureAwait(false);
+
+ await DeepLHttpClient.CheckStatusCodeAsync(responseMessage).ConfigureAwait(false);
+ var sessionInfo = await JsonUtils.DeserializeAsync<VoiceSessionInfo>(responseMessage).ConfigureAwait(false);
+
+ // Establish WebSocket connection
+ var wsUri = new Uri($"{sessionInfo.StreamingUrl}?token={Uri.EscapeDataString(sessionInfo.Token)}");
+ var webSocket = new ClientWebSocket();
+ try {
+ await webSocket.ConnectAsync(wsUri, cancellationToken).ConfigureAwait(false);
+ } catch (Exception ex) {
+ webSocket.Dispose();
+ throw new DeepLException("Failed to establish Voice API WebSocket connection", ex);
+ }
+
+ return new VoiceSession(_client, webSocket, sessionInfo);
+ }
+
/// Class used for JSON-deserialization of style rule list results.
private readonly struct StyleRuleListResult {
/// Initializes a new instance of , used for JSON deserialization.
diff --git a/DeepL/IVoiceManager.cs b/DeepL/IVoiceManager.cs
new file mode 100644
index 0000000..afc2e6f
--- /dev/null
+++ b/DeepL/IVoiceManager.cs
@@ -0,0 +1,28 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace DeepL {
+ /// Interface for creating Voice API streaming sessions.
+ public interface IVoiceManager : IDisposable {
+ ///
+ /// Creates a new Voice API streaming session for real-time speech transcription and translation.
+ /// This requests a session from the DeepL API and establishes a WebSocket connection.
+ ///
+ /// Options controlling session configuration including audio format, languages, etc.
+ /// The cancellation token to cancel the operation.
+ /// An <see cref="IVoiceSession"/> for streaming audio and receiving transcripts.
+ /// If any option is invalid.
+ ///
+ /// If any error occurs while communicating with the DeepL API, a
+ /// or a derived class will be thrown.
+ ///
+ Task<IVoiceSession> CreateVoiceSessionAsync(
+ VoiceSessionOptions options,
+ CancellationToken cancellationToken = default);
+ }
+}
diff --git a/DeepL/IVoiceSession.cs b/DeepL/IVoiceSession.cs
new file mode 100644
index 0000000..d5d0e6c
--- /dev/null
+++ b/DeepL/IVoiceSession.cs
@@ -0,0 +1,77 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using DeepL.Model;
+
+namespace DeepL {
+ ///
+ /// Represents an active Voice API streaming session. Provides methods for sending audio data and receiving
+ /// real-time transcriptions and translations via events.
+ ///
+ ///
+ /// Events fire on a background thread. Consumers are responsible for marshaling to the appropriate
+ /// synchronization context if needed. Dispose the session to close the WebSocket connection.
+ ///
+ public interface IVoiceSession : IDisposable {
+ /// Raised when a source transcript update is received from the server.
+ event EventHandler<TranscriptUpdate>? SourceTranscriptUpdated;
+
+ /// Raised when a target transcript update is received from the server.
+ event EventHandler<TranscriptUpdate>? TargetTranscriptUpdated;
+
+ ///
+ /// Raised when a target media audio chunk is received from the server. This feature is in closed beta.
+ ///
+ event EventHandler<TargetMediaChunk>? TargetMediaChunkReceived;
+
+ /// Raised when an error message is received from the WebSocket connection.
+ event EventHandler<VoiceStreamError>? ErrorReceived;
+
+ /// Raised when the end-of-stream message is received, indicating all outputs are complete.
+ event EventHandler? StreamEnded;
+
+ /// The unique session identifier.
+ string? SessionId { get; }
+
+ /// Whether the WebSocket connection is currently open.
+ bool IsConnected { get; }
+
+ ///
+ /// Sends a chunk of audio data to the server. The audio encoding must match the
+ /// specified when creating the session.
+ ///
+ /// Audio data to send. Must not exceed 100 KB or 1 second duration.
+ /// The cancellation token to cancel the operation.
+ /// If the session is not connected or sending fails.
+ Task SendAudioAsync(byte[] audioData, CancellationToken cancellationToken = default);
+
+ ///
+ /// Sends a chunk of audio data to the server using a memory-efficient overload.
+ ///
+ /// Audio data to send. Must not exceed 100 KB or 1 second duration.
+ /// The cancellation token to cancel the operation.
+ /// If the session is not connected or sending fails.
+ Task SendAudioAsync(ArraySegment<byte> audioData, CancellationToken cancellationToken = default);
+
+ ///
+ /// Signals the end of the audio stream. Causes finalization of tentative transcript segments and
+ /// triggers emission of final transcript updates, end-of-transcript, and end-of-stream messages.
+ /// No more audio data can be sent after calling this method.
+ ///
+ /// The cancellation token to cancel the operation.
+ /// If the session is not connected or sending fails.
+ Task EndAudioAsync(CancellationToken cancellationToken = default);
+
+ ///
+ /// Requests a reconnection token and establishes a new WebSocket connection, resuming the session.
+ /// This should be called when the WebSocket connection is lost unexpectedly.
+ ///
+ /// The cancellation token to cancel the operation.
+ /// If reconnection fails.
+ Task ReconnectAsync(CancellationToken cancellationToken = default);
+ }
+}
diff --git a/DeepL/Model/TargetMediaChunk.cs b/DeepL/Model/TargetMediaChunk.cs
new file mode 100644
index 0000000..f6b1522
--- /dev/null
+++ b/DeepL/Model/TargetMediaChunk.cs
@@ -0,0 +1,68 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System.Text.Json.Serialization;
+
+namespace DeepL.Model {
+ ///
+ /// Represents a translated audio chunk from the Voice API. This feature is currently in closed beta.
+ /// Audio data is provided as an array of base64-encoded indivisible chunks.
+ ///
+ public sealed class TargetMediaChunk {
+ /// Initializes a new instance of .
+ /// The content type of the audio data. Present in the first message.
+ /// Number of header packets at the start of the data array, or null if all are audio.
+ /// Array of base64-encoded audio data packets.
+ /// Text corresponding to this audio chunk, for subtitle synchronization.
+ /// The target language of this audio chunk.
+ /// Duration of this audio chunk in seconds.
+ ///
+ /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it
+ /// would be marked , but needs to be for JSON deserialization.
+ /// In future this function may have backwards-incompatible changes.
+ ///
+ [JsonConstructor]
+ public TargetMediaChunk(
+ string? contentType,
+ int? headers,
+ string[] data,
+ string? text,
+ string? language,
+ double? duration) {
+ ContentType = contentType;
+ Headers = headers;
+ Data = data;
+ Text = text;
+ Language = language;
+ Duration = duration;
+ }
+
+ /// The content type of the audio data. Present in the first message of a sequence.
+ [JsonPropertyName("content_type")]
+ public string? ContentType { get; }
+
+ ///
+ /// Number of packets at the start of that contain initialization/header data.
+ /// Null or absent when all packets are audio data.
+ ///
+ [JsonPropertyName("headers")]
+ public int? Headers { get; }
+
+ /// Array of base64-encoded indivisible audio data packets.
+ [JsonPropertyName("data")]
+ public string[] Data { get; }
+
+ /// Text corresponding to this audio chunk, for subtitle synchronization.
+ [JsonPropertyName("text")]
+ public string? Text { get; }
+
+ /// The target language of this audio chunk.
+ [JsonPropertyName("language")]
+ public string? Language { get; }
+
+ /// Duration of this audio chunk in seconds.
+ [JsonPropertyName("duration")]
+ public double? Duration { get; }
+ }
+}
diff --git a/DeepL/Model/TranscriptSegment.cs b/DeepL/Model/TranscriptSegment.cs
new file mode 100644
index 0000000..b678ce2
--- /dev/null
+++ b/DeepL/Model/TranscriptSegment.cs
@@ -0,0 +1,29 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System.Text.Json.Serialization;
+
+namespace DeepL.Model {
+ /// A single text segment within a Voice API transcript update.
+ public sealed class TranscriptSegment {
+ /// Initializes a new instance of .
+ /// The text content of this segment.
+ ///
+ /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it
+ /// would be marked , but needs to be for JSON deserialization.
+ /// In future this function may have backwards-incompatible changes.
+ ///
+ [JsonConstructor]
+ public TranscriptSegment(string text) {
+ Text = text;
+ }
+
+ /// The text content of this segment.
+ [JsonPropertyName("text")]
+ public string Text { get; }
+
+ /// Returns the text content of this segment.
+ public override string ToString() => Text;
+ }
+}
diff --git a/DeepL/Model/TranscriptUpdate.cs b/DeepL/Model/TranscriptUpdate.cs
new file mode 100644
index 0000000..9db2adc
--- /dev/null
+++ b/DeepL/Model/TranscriptUpdate.cs
@@ -0,0 +1,41 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System.Text.Json.Serialization;
+
+namespace DeepL.Model {
+ ///
+ /// Represents a transcript update from the Voice API, containing concluded (finalized) and tentative
+ /// (in-progress) text segments. Used for both source and target transcript updates.
+ ///
+ public sealed class TranscriptUpdate {
+ /// Initializes a new instance of .
+ /// Finalized text segments that will not change.
+ /// Preliminary text segments that may be refined.
+ /// The language code of this transcript update. Only present on target updates.
+ ///
+ /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it
+ /// would be marked , but needs to be for JSON deserialization.
+ /// In future this function may have backwards-incompatible changes.
+ ///
+ [JsonConstructor]
+ public TranscriptUpdate(TranscriptSegment[] concluded, TranscriptSegment[] tentative, string? language) {
+ Concluded = concluded;
+ Tentative = tentative;
+ Language = language;
+ }
+
+ /// Finalized text segments that will not change. These segments are sent once and remain fixed.
+ [JsonPropertyName("concluded")]
+ public TranscriptSegment[] Concluded { get; }
+
+ /// Preliminary text segments that may be refined as more audio context becomes available.
+ [JsonPropertyName("tentative")]
+ public TranscriptSegment[] Tentative { get; }
+
+ /// The language code of this transcript update. Only present on target transcript updates.
+ [JsonPropertyName("language")]
+ public string? Language { get; }
+ }
+}
diff --git a/DeepL/Model/VoiceSessionInfo.cs b/DeepL/Model/VoiceSessionInfo.cs
new file mode 100644
index 0000000..45aa899
--- /dev/null
+++ b/DeepL/Model/VoiceSessionInfo.cs
@@ -0,0 +1,40 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System.Text.Json.Serialization;
+
+namespace DeepL.Model {
+ /// Information about a Voice API session, received from the session request endpoint.
+ public sealed class VoiceSessionInfo {
+ /// Initializes a new instance of .
+ /// The WebSocket URL for establishing the stream connection.
+ /// Ephemeral authentication token for the streaming endpoint.
+ /// Unique identifier for the session.
+ ///
+ /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it
+ /// would be marked , but needs to be for JSON deserialization.
+ /// In future this function may have backwards-incompatible changes.
+ ///
+ [JsonConstructor]
+ public VoiceSessionInfo(string streamingUrl, string token, string? sessionId) {
+ StreamingUrl = streamingUrl;
+ Token = token;
+ SessionId = sessionId;
+ }
+
+ /// The WebSocket URL to use for establishing the stream connection.
+ [JsonPropertyName("streaming_url")]
+ public string StreamingUrl { get; }
+
+ ///
+ /// Ephemeral authentication token for the streaming endpoint. Valid for one-time use only.
+ ///
+ [JsonPropertyName("token")]
+ public string Token { get; }
+
+ /// Unique identifier for the session.
+ [JsonPropertyName("session_id")]
+ public string? SessionId { get; }
+ }
+}
diff --git a/DeepL/Model/VoiceStreamError.cs b/DeepL/Model/VoiceStreamError.cs
new file mode 100644
index 0000000..80a0311
--- /dev/null
+++ b/DeepL/Model/VoiceStreamError.cs
@@ -0,0 +1,41 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System.Text.Json.Serialization;
+
+namespace DeepL.Model {
+ /// Represents an error message received from the Voice API WebSocket connection.
+ public sealed class VoiceStreamError {
+ /// Initializes a new instance of .
+ /// The error code.
+ /// The reason code for the error.
+ /// A human-readable error message.
+ ///
+ /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it
+ /// would be marked , but needs to be for JSON deserialization.
+ /// In future this function may have backwards-incompatible changes.
+ ///
+ [JsonConstructor]
+ public VoiceStreamError(string? code, string? reason, string? message) {
+ Code = code;
+ Reason = reason;
+ Message = message;
+ }
+
+ /// The error code.
+ [JsonPropertyName("code")]
+ public string? Code { get; }
+
+ /// The reason code for the error.
+ [JsonPropertyName("reason")]
+ public string? Reason { get; }
+
+ /// A human-readable error message.
+ [JsonPropertyName("message")]
+ public string? Message { get; }
+
+ /// Returns the error message.
+ public override string ToString() => $"VoiceStreamError(code={Code}, reason={Reason}, message={Message})";
+ }
+}
diff --git a/DeepL/SourceLanguageMode.cs b/DeepL/SourceLanguageMode.cs
new file mode 100644
index 0000000..521037f
--- /dev/null
+++ b/DeepL/SourceLanguageMode.cs
@@ -0,0 +1,29 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+
+namespace DeepL {
+ /// Controls how the source language value is used in Voice API sessions.
+ public enum SourceLanguageMode {
+ /// Treats source language as a hint; server can override.
+ Auto,
+
+ /// Treats source language as mandatory; server must use this language.
+ Fixed
+ }
+
+ /// Extension methods for .
+ public static class SourceLanguageModeExtensions {
+ /// Retrieves the string representation used by the DeepL API.
+ /// If an unknown enum value is passed.
+ public static string ToApiValue(this SourceLanguageMode mode) {
+ return mode switch {
+ SourceLanguageMode.Auto => "auto",
+ SourceLanguageMode.Fixed => "fixed",
+ _ => throw new ArgumentOutOfRangeException(nameof(mode), mode, "Unrecognized source language mode value")
+ };
+ }
+ }
+}
diff --git a/DeepL/SourceMediaContentType.cs b/DeepL/SourceMediaContentType.cs
new file mode 100644
index 0000000..fe48105
--- /dev/null
+++ b/DeepL/SourceMediaContentType.cs
@@ -0,0 +1,68 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+namespace DeepL {
+ ///
+ /// String constants for audio format content types supported by the DeepL Voice API.
+ /// Use these when configuring .
+ ///
+ public static class SourceMediaContentType {
+ /// Auto-detect container and codec. Supported for all formats except PCM.
+ public const string Auto = "audio/auto";
+
+ /// FLAC container with FLAC codec.
+ public const string Flac = "audio/flac";
+
+ /// MPEG container with MP3 codec.
+ public const string Mpeg = "audio/mpeg";
+
+ /// Ogg container with auto-detected codec (FLAC or OPUS).
+ public const string Ogg = "audio/ogg";
+
+ /// WebM container with OPUS codec.
+ public const string WebM = "audio/webm";
+
+ /// Matroska container with auto-detected codec.
+ public const string Matroska = "audio/x-matroska";
+
+ /// Ogg container with FLAC codec.
+ public const string OggFlac = "audio/ogg;codecs=flac";
+
+ /// Ogg container with OPUS codec.
+ public const string OggOpus = "audio/ogg;codecs=opus";
+
+ /// PCM signed 16-bit little-endian at 8000 Hz.
+ public const string PcmS16le8000 = "audio/pcm;encoding=s16le;rate=8000";
+
+ /// PCM signed 16-bit little-endian at 16000 Hz. Recommended for general use.
+ public const string PcmS16le16000 = "audio/pcm;encoding=s16le;rate=16000";
+
+ /// PCM signed 16-bit little-endian at 44100 Hz.
+ public const string PcmS16le44100 = "audio/pcm;encoding=s16le;rate=44100";
+
+ /// PCM signed 16-bit little-endian at 48000 Hz.
+ public const string PcmS16le48000 = "audio/pcm;encoding=s16le;rate=48000";
+
+ /// PCM A-Law at 8000 Hz (G.711).
+ public const string PcmAlaw8000 = "audio/pcm;encoding=alaw;rate=8000";
+
+ /// PCM µ-Law at 8000 Hz (G.711).
+ public const string PcmUlaw8000 = "audio/pcm;encoding=ulaw;rate=8000";
+
+ /// WebM container with OPUS codec (explicit).
+ public const string WebMOpus = "audio/webm;codecs=opus";
+
+ /// Matroska container with AAC codec.
+ public const string MatroskaAac = "audio/x-matroska;codecs=aac";
+
+ /// Matroska container with FLAC codec.
+ public const string MatroskaFlac = "audio/x-matroska;codecs=flac";
+
+ /// Matroska container with MP3 codec.
+ public const string MatroskaMp3 = "audio/x-matroska;codecs=mp3";
+
+ /// Matroska container with OPUS codec.
+ public const string MatroskaOpus = "audio/x-matroska;codecs=opus";
+ }
+}
diff --git a/DeepL/TargetMediaVoice.cs b/DeepL/TargetMediaVoice.cs
new file mode 100644
index 0000000..10b5c33
--- /dev/null
+++ b/DeepL/TargetMediaVoice.cs
@@ -0,0 +1,32 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+
+namespace DeepL {
+ ///
+ /// Target audio voice selection for synthesized speech in Voice API sessions.
+ /// This feature is currently in closed beta.
+ ///
+ public enum TargetMediaVoice {
+ /// Male voice.
+ Male,
+
+ /// Female voice.
+ Female
+ }
+
+ /// Extension methods for .
+ public static class TargetMediaVoiceExtensions {
+ /// Retrieves the string representation used by the DeepL API.
+ /// If an unknown enum value is passed.
+ public static string ToApiValue(this TargetMediaVoice voice) {
+ return voice switch {
+ TargetMediaVoice.Male => "male",
+ TargetMediaVoice.Female => "female",
+ _ => throw new ArgumentOutOfRangeException(nameof(voice), voice, "Unrecognized target media voice value")
+ };
+ }
+ }
+}
diff --git a/DeepL/VoiceMessageFormat.cs b/DeepL/VoiceMessageFormat.cs
new file mode 100644
index 0000000..d4aace6
--- /dev/null
+++ b/DeepL/VoiceMessageFormat.cs
@@ -0,0 +1,29 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+
+namespace DeepL {
+ /// Message encoding format for Voice API WebSocket communication.
+ public enum VoiceMessageFormat {
+ /// JSON-encoded messages sent as TEXT WebSocket frames. Binary fields are base64-encoded.
+ Json,
+
+ /// MessagePack-encoded messages sent as BINARY WebSocket frames. Binary fields are raw binary.
+ MessagePack
+ }
+
+ /// Extension methods for .
+ public static class VoiceMessageFormatExtensions {
+ /// Retrieves the string representation used by the DeepL API.
+ /// If an unknown enum value is passed.
+ public static string ToApiValue(this VoiceMessageFormat format) {
+ return format switch {
+ VoiceMessageFormat.Json => "json",
+ VoiceMessageFormat.MessagePack => "msgpack",
+ _ => throw new ArgumentOutOfRangeException(nameof(format), format, "Unrecognized message format value")
+ };
+ }
+ }
+}
diff --git a/DeepL/VoiceSession.cs b/DeepL/VoiceSession.cs
new file mode 100644
index 0000000..0ff826f
--- /dev/null
+++ b/DeepL/VoiceSession.cs
@@ -0,0 +1,258 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+using System.Net.WebSockets;
+using System.Text;
+using System.Text.Json;
+using System.Threading;
+using System.Threading.Tasks;
+using DeepL.Internal;
+using DeepL.Model;
+
+namespace DeepL {
+ ///
+ /// Internal implementation of that manages a WebSocket connection
+ /// to the DeepL Voice API for real-time speech transcription and translation.
+ ///
+ internal sealed class VoiceSession : IVoiceSession {
+ private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions {
+ PropertyNamingPolicy = JsonNamingPolicy.CamelCase
+ };
+
+ private readonly DeepLHttpClient _httpClient;
+ private readonly object _lock = new object();
+ private ClientWebSocket _webSocket;
+ private CancellationTokenSource _receiveCts;
+ private Task? _receiveTask;
+ private string _lastToken;
+ private bool _disposed;
+
+ ///
+ public event EventHandler<TranscriptUpdate>? SourceTranscriptUpdated;
+
+ ///
+ public event EventHandler<TranscriptUpdate>? TargetTranscriptUpdated;
+
+ ///
+ public event EventHandler<TargetMediaChunk>? TargetMediaChunkReceived;
+
+ ///
+ public event EventHandler<VoiceStreamError>? ErrorReceived;
+
+ ///
+ public event EventHandler? StreamEnded;
+
+ ///
+ public string? SessionId { get; private set; }
+
+ ///
+ public bool IsConnected {
+ get {
+ lock (_lock) {
+ return !_disposed && _webSocket.State == WebSocketState.Open;
+ }
+ }
+ }
+
+ internal VoiceSession(
+ DeepLHttpClient httpClient,
+ ClientWebSocket webSocket,
+ VoiceSessionInfo sessionInfo) {
+ _httpClient = httpClient;
+ _webSocket = webSocket;
+ _lastToken = sessionInfo.Token;
+ SessionId = sessionInfo.SessionId;
+ _receiveCts = new CancellationTokenSource();
+ _receiveTask = Task.Run(() => ReceiveLoopAsync(_receiveCts.Token));
+ }
+
+ ///
+ public async Task SendAudioAsync(byte[] audioData, CancellationToken cancellationToken = default) {
+ await SendAudioAsync(new ArraySegment<byte>(audioData), cancellationToken).ConfigureAwait(false);
+ }
+
+ ///
+ public async Task SendAudioAsync(ArraySegment<byte> audioData, CancellationToken cancellationToken = default) {
+ EnsureConnected();
+
+ var base64Data = Convert.ToBase64String(
+ audioData.Array ?? throw new ArgumentException("Audio data array is null"),
+ audioData.Offset,
+ audioData.Count);
+ var message = $"{{\"source_media_chunk\":{{\"data\":\"{base64Data}\"}}}}";
+ var bytes = Encoding.UTF8.GetBytes(message);
+
+ await _webSocket.SendAsync(
+ new ArraySegment<byte>(bytes),
+ WebSocketMessageType.Text,
+ endOfMessage: true,
+ cancellationToken).ConfigureAwait(false);
+ }
+
+ ///
+ public async Task EndAudioAsync(CancellationToken cancellationToken = default) {
+ EnsureConnected();
+
+ var message = "{\"end_of_source_media\":{}}";
+ var bytes = Encoding.UTF8.GetBytes(message);
+
+ await _webSocket.SendAsync(
+ new ArraySegment<byte>(bytes),
+ WebSocketMessageType.Text,
+ endOfMessage: true,
+ cancellationToken).ConfigureAwait(false);
+ }
+
+ ///
+ public async Task ReconnectAsync(CancellationToken cancellationToken = default) {
+ // Stop current receive loop
+ _receiveCts.Cancel();
+ if (_receiveTask != null) {
+ try {
+ await _receiveTask.ConfigureAwait(false);
+ } catch (OperationCanceledException) {
+ // Expected
+ }
+ }
+
+ // Close existing WebSocket if still open
+ if (_webSocket.State == WebSocketState.Open || _webSocket.State == WebSocketState.CloseReceived) {
+ try {
+ await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Reconnecting", CancellationToken.None)
+ .ConfigureAwait(false);
+ } catch (WebSocketException) {
+ // Ignore close errors during reconnection
+ }
+ }
+
+ _webSocket.Dispose();
+
+ // Request new token via GET v3/voice/realtime?token=
+ var queryParams = new[] { ("token", _lastToken) };
+ using var responseMessage = await _httpClient.ApiGetAsync("v3/voice/realtime", cancellationToken, queryParams)
+ .ConfigureAwait(false);
+ await DeepLHttpClient.CheckStatusCodeAsync(responseMessage).ConfigureAwait(false);
+ var sessionInfo = await JsonUtils.DeserializeAsync<VoiceSessionInfo>(responseMessage).ConfigureAwait(false);
+
+ _lastToken = sessionInfo.Token;
+ SessionId = sessionInfo.SessionId;
+
+ // Establish new WebSocket connection
+ var wsUri = new Uri($"{sessionInfo.StreamingUrl}?token={Uri.EscapeDataString(sessionInfo.Token)}");
+ _webSocket = new ClientWebSocket();
+ await _webSocket.ConnectAsync(wsUri, cancellationToken).ConfigureAwait(false);
+
+ // Restart receive loop
+ _receiveCts = new CancellationTokenSource();
+ _receiveTask = Task.Run(() => ReceiveLoopAsync(_receiveCts.Token));
+ }
+
+ /// Background loop that receives and dispatches WebSocket messages.
+ private async Task ReceiveLoopAsync(CancellationToken cancellationToken) {
+ var buffer = new byte[64 * 1024]; // 64 KB buffer
+ var messageBuilder = new StringBuilder();
+
+ try {
+ while (!cancellationToken.IsCancellationRequested &&
+ _webSocket.State == WebSocketState.Open) {
+ messageBuilder.Clear();
+ WebSocketReceiveResult result;
+ do {
+ result = await _webSocket.ReceiveAsync(
+ new ArraySegment<byte>(buffer), cancellationToken).ConfigureAwait(false);
+
+ if (result.MessageType == WebSocketMessageType.Close) {
+ return;
+ }
+
+ if (result.MessageType == WebSocketMessageType.Text) {
+ messageBuilder.Append(Encoding.UTF8.GetString(buffer, 0, result.Count));
+ }
+ } while (!result.EndOfMessage);
+
+ if (messageBuilder.Length > 0) {
+ DispatchMessage(messageBuilder.ToString());
+ }
+ }
+ } catch (OperationCanceledException) {
+ // Normal cancellation
+ } catch (WebSocketException) {
+ // Connection lost — consumer should call ReconnectAsync
+ }
+ }
+
+ /// Parses a JSON message from the WebSocket and dispatches it to the appropriate event.
+ private void DispatchMessage(string json) {
+ try {
+ using var document = JsonDocument.Parse(json);
+ var root = document.RootElement;
+
+ if (root.TryGetProperty("source_transcript_update", out var sourceUpdate)) {
+ var update = JsonSerializer.Deserialize<TranscriptUpdate>(sourceUpdate.GetRawText(), JsonOptions);
+ if (update != null) {
+ SourceTranscriptUpdated?.Invoke(this, update);
+ }
+ } else if (root.TryGetProperty("target_transcript_update", out var targetUpdate)) {
+ var update = JsonSerializer.Deserialize<TranscriptUpdate>(targetUpdate.GetRawText(), JsonOptions);
+ if (update != null) {
+ TargetTranscriptUpdated?.Invoke(this, update);
+ }
+ } else if (root.TryGetProperty("target_media_chunk", out var mediaChunk)) {
+ var chunk = JsonSerializer.Deserialize<TargetMediaChunk>(mediaChunk.GetRawText(), JsonOptions);
+ if (chunk != null) {
+ TargetMediaChunkReceived?.Invoke(this, chunk);
+ }
+ } else if (root.TryGetProperty("end_of_source_transcript", out _)) {
+ // Source transcript complete — no special event needed, handled via StreamEnded
+ } else if (root.TryGetProperty("end_of_target_transcript", out _)) {
+ // Target transcript complete — no special event needed, handled via StreamEnded
+ } else if (root.TryGetProperty("end_of_target_media", out _)) {
+ // Target media complete — no special event needed, handled via StreamEnded
+ } else if (root.TryGetProperty("end_of_stream", out _)) {
+ StreamEnded?.Invoke(this, EventArgs.Empty);
+ } else if (root.TryGetProperty("error", out var errorElement)) {
+ var error = JsonSerializer.Deserialize<VoiceStreamError>(errorElement.GetRawText(), JsonOptions);
+ if (error != null) {
+ ErrorReceived?.Invoke(this, error);
+ }
+ }
+ } catch (JsonException) {
+ // Ignore malformed messages
+ }
+ }
+
+ private void EnsureConnected() {
+ if (_disposed) {
+ throw new ObjectDisposedException(nameof(VoiceSession));
+ }
+
+ if (_webSocket.State != WebSocketState.Open) {
+ throw new DeepLException("Voice session WebSocket is not connected");
+ }
+ }
+
+ /// Releases the WebSocket connection and stops the receive loop.
+ public void Dispose() {
+ lock (_lock) {
+ if (_disposed) return;
+ _disposed = true;
+ }
+
+ _receiveCts.Cancel();
+
+ try {
+ if (_webSocket.State == WebSocketState.Open) {
+ _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Disposing", CancellationToken.None)
+ .GetAwaiter().GetResult();
+ }
+ } catch (WebSocketException) {
+ // Ignore errors during disposal
+ }
+
+ _webSocket.Dispose();
+ _receiveCts.Dispose();
+ }
+ }
+}
diff --git a/DeepL/VoiceSessionOptions.cs b/DeepL/VoiceSessionOptions.cs
new file mode 100644
index 0000000..81a6cac
--- /dev/null
+++ b/DeepL/VoiceSessionOptions.cs
@@ -0,0 +1,71 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+namespace DeepL {
+  /// <summary>
+  /// Options to control Voice API session creation. These options are provided to
+  /// <see cref="DeepLClient.CreateVoiceSessionAsync" />.
+  /// </summary>
+  public sealed class VoiceSessionOptions {
+    /// <summary>Initializes a new <see cref="VoiceSessionOptions" /> object.</summary>
+    public VoiceSessionOptions() { }
+
+    /// <summary>
+    /// The audio format for streaming, which specifies container, codec, and encoding parameters.
+    /// Use constants from <see cref="DeepL.SourceMediaContentType" /> for supported values. Required.
+    /// </summary>
+    public string SourceMediaContentType { get; set; } = DeepL.SourceMediaContentType.Auto;
+
+    /// <summary>
+    /// Message encoding format for WebSocket communication. If null, the API default is used
+    /// (currently <see cref="VoiceMessageFormat.Json" />).
+    /// </summary>
+    public VoiceMessageFormat? MessageFormat { get; set; }
+
+    /// <summary>
+    /// The source language of the audio stream, or null for auto-detection.
+    /// Must be a supported Voice API source language complying with IETF BCP 47 language tags.
+    /// </summary>
+    public string? SourceLanguage { get; set; }
+
+    /// <summary>
+    /// Controls how the <see cref="SourceLanguage" /> value is used.
+    /// Defaults to <see cref="DeepL.SourceLanguageMode.Auto" /> if not specified.
+    /// </summary>
+    public SourceLanguageMode? SourceLanguageMode { get; set; }
+
+    /// <summary>
+    /// List of target languages for translation. The stream will emit translations for each language.
+    /// Maximum 5 target languages per session. Language identifiers must comply with IETF BCP 47.
+    /// </summary>
+    public string[] TargetLanguages { get; set; } = System.Array.Empty<string>();
+
+    /// <summary>
+    /// List of target languages for which to generate synthesized audio. This feature is in closed beta.
+    /// Languages specified here will automatically be added to <see cref="TargetLanguages" /> if not already present.
+    /// Maximum 5 target media languages per session.
+    /// </summary>
+    public string[]? TargetMediaLanguages { get; set; }
+
+    /// <summary>
+    /// The audio format for synthesized target media streaming. This feature is in closed beta.
+    /// Defaults to "audio/webm;codecs=opus" if not specified.
+    /// </summary>
+    public string? TargetMediaContentType { get; set; }
+
+    /// <summary>
+    /// Target audio voice selection for synthesized speech. This feature is in closed beta.
+    /// </summary>
+    public TargetMediaVoice? TargetMediaVoice { get; set; }
+
+    /// <summary>A glossary ID to use for translation.</summary>
+    public string? GlossaryId { get; set; }
+
+    /// <summary>
+    /// Sets whether the translated text should lean towards formal or informal language.
+    /// Possible values: "default", "formal", "more", "informal", "less".
+    /// </summary>
+    public string? Formality { get; set; }
+  }
+}
diff --git a/DeepLTests/VoiceSessionTest.cs b/DeepLTests/VoiceSessionTest.cs
new file mode 100644
index 0000000..6f494a4
--- /dev/null
+++ b/DeepLTests/VoiceSessionTest.cs
@@ -0,0 +1,180 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+using System.Collections.Generic;
+using System.Text.Json;
+using System.Threading.Tasks;
+using DeepL;
+using DeepL.Model;
+using Xunit;
+
+namespace DeepLTests {
+  /// <summary>Unit tests for Voice API types that do not require API access.</summary>
+  public sealed class VoiceSessionUnitTest {
+    [Fact]
+    public void TestVoiceSessionOptionsDefaults() {
+      var options = new VoiceSessionOptions();
+      Assert.Equal(SourceMediaContentType.Auto, options.SourceMediaContentType);
+      Assert.Null(options.MessageFormat);
+      Assert.Null(options.SourceLanguage);
+      Assert.Null(options.SourceLanguageMode);
+      Assert.NotNull(options.TargetLanguages);
+      Assert.Empty(options.TargetLanguages);
+      Assert.Null(options.TargetMediaLanguages);
+      Assert.Null(options.TargetMediaContentType);
+      Assert.Null(options.TargetMediaVoice);
+      Assert.Null(options.GlossaryId);
+      Assert.Null(options.Formality);
+    }
+
+    [Fact]
+    public void TestVoiceSessionOptionsConfiguration() {
+      var options = new VoiceSessionOptions {
+        SourceMediaContentType = SourceMediaContentType.OggOpus,
+        MessageFormat = VoiceMessageFormat.Json,
+        SourceLanguage = "en",
+        SourceLanguageMode = DeepL.SourceLanguageMode.Fixed,
+        TargetLanguages = new[] { "de", "fr", "es" },
+        TargetMediaVoice = TargetMediaVoice.Female,
+        GlossaryId = "test-glossary-id",
+        Formality = "formal"
+      };
+
+      Assert.Equal(SourceMediaContentType.OggOpus, options.SourceMediaContentType);
+      Assert.Equal(VoiceMessageFormat.Json, options.MessageFormat);
+      Assert.Equal("en", options.SourceLanguage);
+      Assert.Equal(DeepL.SourceLanguageMode.Fixed, options.SourceLanguageMode);
+      Assert.Equal(3, options.TargetLanguages.Length);
+      Assert.Equal(TargetMediaVoice.Female, options.TargetMediaVoice);
+      Assert.Equal("test-glossary-id", options.GlossaryId);
+      Assert.Equal("formal", options.Formality);
+    }
+
+    [Fact]
+    public void TestVoiceMessageFormatApiValues() {
+      Assert.Equal("json", VoiceMessageFormat.Json.ToApiValue());
+      Assert.Equal("msgpack", VoiceMessageFormat.MessagePack.ToApiValue());
+    }
+
+    [Fact]
+    public void TestSourceLanguageModeApiValues() {
+      Assert.Equal("auto", DeepL.SourceLanguageMode.Auto.ToApiValue());
+      Assert.Equal("fixed", DeepL.SourceLanguageMode.Fixed.ToApiValue());
+    }
+
+    [Fact]
+    public void TestTargetMediaVoiceApiValues() {
+      Assert.Equal("male", TargetMediaVoice.Male.ToApiValue());
+      Assert.Equal("female", TargetMediaVoice.Female.ToApiValue());
+    }
+
+    [Fact]
+    public void TestVoiceSessionInfoDeserialization() {
+      var json = "{\"streaming_url\":\"wss://api.deepl.com/v3/voice/realtime/connect\"," +
+                 "\"token\":\"test-token-123\"," +
+                 "\"session_id\":\"test-session-456\"}";
+      var info = JsonSerializer.Deserialize<VoiceSessionInfo>(json);
+      Assert.NotNull(info);
+      Assert.Equal("wss://api.deepl.com/v3/voice/realtime/connect", info!.StreamingUrl);
+      Assert.Equal("test-token-123", info.Token);
+      Assert.Equal("test-session-456", info.SessionId);
+    }
+
+    [Fact]
+    public void TestTranscriptUpdateDeserialization() {
+      var json = "{\"concluded\":[{\"text\":\"Hello \"}],\"tentative\":[{\"text\":\"world\"}],\"language\":\"de\"}";
+      var update = JsonSerializer.Deserialize<TranscriptUpdate>(json);
+      Assert.NotNull(update);
+      Assert.Single(update!.Concluded);
+      Assert.Equal("Hello ", update.Concluded[0].Text);
+      Assert.Single(update.Tentative);
+      Assert.Equal("world", update.Tentative[0].Text);
+      Assert.Equal("de", update.Language);
+    }
+
+    [Fact]
+    public void TestTranscriptSegmentDeserialization() {
+      var json = "{\"text\":\"Hello world\"}";
+      var segment = JsonSerializer.Deserialize<TranscriptSegment>(json);
+      Assert.NotNull(segment);
+      Assert.Equal("Hello world", segment!.Text);
+      Assert.Equal("Hello world", segment.ToString());
+    }
+
+    [Fact]
+    public void TestTargetMediaChunkDeserialization() {
+      var json = "{\"content_type\":\"audio/webm;codecs=opus\"," +
+                 "\"headers\":1," +
+                 "\"data\":[\"base64data1\",\"base64data2\"]," +
+                 "\"text\":\"Hallo Welt\"," +
+                 "\"language\":\"de\"," +
+                 "\"duration\":1.5}";
+      var chunk = JsonSerializer.Deserialize<TargetMediaChunk>(json);
+      Assert.NotNull(chunk);
+      Assert.Equal("audio/webm;codecs=opus", chunk!.ContentType);
+      Assert.Equal(1, chunk.Headers);
+      Assert.Equal(2, chunk.Data.Length);
+      Assert.Equal("base64data1", chunk.Data[0]);
+      Assert.Equal("Hallo Welt", chunk.Text);
+      Assert.Equal("de", chunk.Language);
+      Assert.Equal(1.5, chunk.Duration);
+    }
+
+    [Fact]
+    public void TestVoiceStreamErrorDeserialization() {
+      var json = "{\"code\":\"4001\",\"reason\":\"invalid_audio\",\"message\":\"Audio format not supported\"}";
+      var error = JsonSerializer.Deserialize<VoiceStreamError>(json);
+      Assert.NotNull(error);
+      Assert.Equal("4001", error!.Code);
+      Assert.Equal("invalid_audio", error.Reason);
+      Assert.Equal("Audio format not supported", error.Message);
+    }
+
+    [Fact]
+    public void TestSourceMediaContentTypeConstants() {
+      Assert.Equal("audio/auto", SourceMediaContentType.Auto);
+      Assert.Equal("audio/flac", SourceMediaContentType.Flac);
+      Assert.Equal("audio/mpeg", SourceMediaContentType.Mpeg);
+      Assert.Equal("audio/ogg", SourceMediaContentType.Ogg);
+      Assert.Equal("audio/webm", SourceMediaContentType.WebM);
+      Assert.Equal("audio/x-matroska", SourceMediaContentType.Matroska);
+      Assert.Equal("audio/ogg;codecs=flac", SourceMediaContentType.OggFlac);
+      Assert.Equal("audio/ogg;codecs=opus", SourceMediaContentType.OggOpus);
+      Assert.Equal("audio/pcm;encoding=s16le;rate=16000", SourceMediaContentType.PcmS16le16000);
+      Assert.Equal("audio/webm;codecs=opus", SourceMediaContentType.WebMOpus);
+    }
+  }
+
+  /// <summary>Tests for Voice API session creation that require API access.</summary>
+  public sealed class VoiceSessionClientTest : BaseDeepLTest {
+    [Fact]
+    public async Task TestCreateSessionRequiresTargetLanguages() {
+      var client = CreateTestClient();
+      var options = new VoiceSessionOptions {
+        SourceMediaContentType = SourceMediaContentType.OggOpus
+      };
+      // Validation happens client-side, before any request is sent.
+      await Assert.ThrowsAsync<ArgumentException>(
+        () => client.CreateVoiceSessionAsync(options));
+    }
+
+    [Fact]
+    public async Task TestCreateSessionRejectsExcessiveTargetLanguages() {
+      var client = CreateTestClient();
+      var options = new VoiceSessionOptions {
+        SourceMediaContentType = SourceMediaContentType.OggOpus,
+        TargetLanguages = new[] { "de", "fr", "es", "it", "nl", "pt" }
+      };
+      // Six target languages exceeds the documented maximum of five.
+      await Assert.ThrowsAsync<ArgumentException>(
+        () => client.CreateVoiceSessionAsync(options));
+    }
+
+    [Fact]
+    public async Task TestCreateSessionRejectsNullOptions() {
+      var client = CreateTestClient();
+      await Assert.ThrowsAsync<ArgumentNullException>(
+        () => client.CreateVoiceSessionAsync(null!));
+    }
+  }
+}