From 5330a216b66e807fcba5df96e68ce5bf14530eba Mon Sep 17 00:00:00 2001 From: Tim Cadenbach Date: Tue, 31 Mar 2026 11:22:53 +0200 Subject: [PATCH 1/2] Draft for a DeepL Voice implementation --- DeepL/DeepL.csproj | 1 + DeepL/DeepLClient.cs | 76 ++++++++- DeepL/IVoiceManager.cs | 28 ++++ DeepL/IVoiceSession.cs | 77 +++++++++ DeepL/Model/TargetMediaChunk.cs | 68 ++++++++ DeepL/Model/TranscriptSegment.cs | 29 ++++ DeepL/Model/TranscriptUpdate.cs | 41 +++++ DeepL/Model/VoiceSessionInfo.cs | 40 +++++ DeepL/Model/VoiceStreamError.cs | 41 +++++ DeepL/SourceLanguageMode.cs | 29 ++++ DeepL/SourceMediaContentType.cs | 68 ++++++++ DeepL/TargetMediaVoice.cs | 32 ++++ DeepL/VoiceMessageFormat.cs | 29 ++++ DeepL/VoiceSession.cs | 258 +++++++++++++++++++++++++++++++ DeepL/VoiceSessionOptions.cs | 70 +++++++++ DeepLTests/VoiceSessionTest.cs | 180 +++++++++++++++++++++ 16 files changed, 1066 insertions(+), 1 deletion(-) create mode 100644 DeepL/IVoiceManager.cs create mode 100644 DeepL/IVoiceSession.cs create mode 100644 DeepL/Model/TargetMediaChunk.cs create mode 100644 DeepL/Model/TranscriptSegment.cs create mode 100644 DeepL/Model/TranscriptUpdate.cs create mode 100644 DeepL/Model/VoiceSessionInfo.cs create mode 100644 DeepL/Model/VoiceStreamError.cs create mode 100644 DeepL/SourceLanguageMode.cs create mode 100644 DeepL/SourceMediaContentType.cs create mode 100644 DeepL/TargetMediaVoice.cs create mode 100644 DeepL/VoiceMessageFormat.cs create mode 100644 DeepL/VoiceSession.cs create mode 100644 DeepL/VoiceSessionOptions.cs create mode 100644 DeepLTests/VoiceSessionTest.cs diff --git a/DeepL/DeepL.csproj b/DeepL/DeepL.csproj index f6319aa..c6c8400 100644 --- a/DeepL/DeepL.csproj +++ b/DeepL/DeepL.csproj @@ -34,6 +34,7 @@ + diff --git a/DeepL/DeepLClient.cs b/DeepL/DeepLClient.cs index 6a2dc91..73cb4d4 100644 --- a/DeepL/DeepLClient.cs +++ b/DeepL/DeepLClient.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Net.WebSockets; 
using System.Text.Json;
 using System.Text.Json.Serialization;
 using System.Threading;
@@ -54,7 +55,7 @@ Task RephraseTextAsync(
   /// Client for the DeepL API. To use the DeepL API, initialize an instance of this class using your DeepL
   /// Authentication Key. All functions are thread-safe, aside from .
   ///
-  public sealed class DeepLClient : Translator, IWriter, IGlossaryManager, IStyleRuleManager {
+  public sealed class DeepLClient : Translator, IWriter, IGlossaryManager, IStyleRuleManager, IVoiceManager {
     /// Initializes a new instance of the class.
     /// The message that describes the error.
     public DeepLClient(string authKey, DeepLClientOptions? options = null) : base(authKey, options) { }
@@ -939,6 +940,75 @@ private static (string Key, string Value)[] CreateLanguageQueryParams(
       DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
     };
 
+    /// <inheritdoc />
+    public async Task<IVoiceSession> CreateVoiceSessionAsync(
+          VoiceSessionOptions options,
+          CancellationToken cancellationToken = default) {
+      if (options == null) {
+        throw new ArgumentNullException(nameof(options));
+      }
+
+      // The options documentation marks the source media content type as required.
+      if (string.IsNullOrWhiteSpace(options.SourceMediaContentType)) {
+        throw new ArgumentException("Source media content type must be specified");
+      }
+
+      if (options.TargetLanguages == null || options.TargetLanguages.Length == 0) {
+        throw new ArgumentException("At least one target language must be specified");
+      }
+
+      if (options.TargetLanguages.Length > 5) {
+        throw new ArgumentException("Maximum 5 target languages per session");
+      }
+
+      // Target media languages are documented with the same per-session limit as target languages.
+      if (options.TargetMediaLanguages != null && options.TargetMediaLanguages.Length > 5) {
+        throw new ArgumentException("Maximum 5 target media languages per session");
+      }
+
+      var requestData = new Dictionary<string, object> {
+        ["source_media_content_type"] = options.SourceMediaContentType,
+        ["target_languages"] = options.TargetLanguages
+      };
+
+      // Optional settings are only included in the request when they are set.
+      void AddIfSet(string key, object? value) {
+        if (value != null) {
+          requestData[key] = value;
+        }
+      }
+
+      AddIfSet("message_format", options.MessageFormat?.ToApiValue());
+      AddIfSet("source_language", options.SourceLanguage);
+      AddIfSet("source_language_mode", options.SourceLanguageMode?.ToApiValue());
+      AddIfSet("target_media_languages", options.TargetMediaLanguages);
+      AddIfSet("target_media_content_type", options.TargetMediaContentType);
+      AddIfSet("target_media_voice", options.TargetMediaVoice?.ToApiValue());
+      AddIfSet("glossary_id", options.GlossaryId);
+      AddIfSet("formality", options.Formality);
+
+      using var responseMessage = await _client
+            .ApiPostJsonAsync("v3/voice/realtime", cancellationToken, requestData, SerializationOptions)
+            .ConfigureAwait(false);
+
+      await DeepLHttpClient.CheckStatusCodeAsync(responseMessage).ConfigureAwait(false);
+      var sessionInfo = await JsonUtils.DeserializeAsync<VoiceSessionInfo>(responseMessage).ConfigureAwait(false);
+
+      // Establish WebSocket connection. The token is passed as a query parameter; use '&' when the streaming
+      // URL already has a query string so the resulting URI stays well-formed.
+      var separator = sessionInfo.StreamingUrl.IndexOf('?') >= 0 ? '&' : '?';
+      var wsUri = new Uri($"{sessionInfo.StreamingUrl}{separator}token={Uri.EscapeDataString(sessionInfo.Token)}");
+      var webSocket = new ClientWebSocket();
+      try {
+        await webSocket.ConnectAsync(wsUri, cancellationToken).ConfigureAwait(false);
+      } catch (Exception ex) {
+        webSocket.Dispose();
+        throw new DeepLException("Failed to establish Voice API WebSocket connection", ex);
+      }
+
+      return new VoiceSession(_client, webSocket, sessionInfo);
+    }
+
     /// Class used for JSON-deserialization of style rule list results.
     private readonly struct StyleRuleListResult {
       /// Initializes a new instance of , used for JSON deserialization.
diff --git a/DeepL/IVoiceManager.cs b/DeepL/IVoiceManager.cs
new file mode 100644
index 0000000..afc2e6f
--- /dev/null
+++ b/DeepL/IVoiceManager.cs
@@ -0,0 +1,28 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace DeepL {
+  /// Interface for creating Voice API streaming sessions.
+ public interface IVoiceManager : IDisposable { + /// + /// Creates a new Voice API streaming session for real-time speech transcription and translation. + /// This requests a session from the DeepL API and establishes a WebSocket connection. + /// + /// Options controlling session configuration including audio format, languages, etc. + /// The cancellation token to cancel the operation. + /// An for streaming audio and receiving transcripts. + /// If any option is invalid. + /// + /// If any error occurs while communicating with the DeepL API, a + /// or a derived class will be thrown. + /// + Task CreateVoiceSessionAsync( + VoiceSessionOptions options, + CancellationToken cancellationToken = default); + } +} diff --git a/DeepL/IVoiceSession.cs b/DeepL/IVoiceSession.cs new file mode 100644 index 0000000..d5d0e6c --- /dev/null +++ b/DeepL/IVoiceSession.cs @@ -0,0 +1,77 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System; +using System.Threading; +using System.Threading.Tasks; +using DeepL.Model; + +namespace DeepL { + /// + /// Represents an active Voice API streaming session. Provides methods for sending audio data and receiving + /// real-time transcriptions and translations via events. + /// + /// + /// Events fire on a background thread. Consumers are responsible for marshaling to the appropriate + /// synchronization context if needed. Dispose the session to close the WebSocket connection. + /// + public interface IVoiceSession : IDisposable { + /// Raised when a source transcript update is received from the server. + event EventHandler? SourceTranscriptUpdated; + + /// Raised when a target transcript update is received from the server. + event EventHandler? TargetTranscriptUpdated; + + /// + /// Raised when a target media audio chunk is received from the server. This feature is in closed beta. + /// + event EventHandler? 
TargetMediaChunkReceived; + + /// Raised when an error message is received from the WebSocket connection. + event EventHandler? ErrorReceived; + + /// Raised when the end-of-stream message is received, indicating all outputs are complete. + event EventHandler? StreamEnded; + + /// The unique session identifier. + string? SessionId { get; } + + /// Whether the WebSocket connection is currently open. + bool IsConnected { get; } + + /// + /// Sends a chunk of audio data to the server. The audio encoding must match the + /// specified when creating the session. + /// + /// Audio data to send. Must not exceed 100 KB or 1 second duration. + /// The cancellation token to cancel the operation. + /// If the session is not connected or sending fails. + Task SendAudioAsync(byte[] audioData, CancellationToken cancellationToken = default); + + /// + /// Sends a chunk of audio data to the server using a memory-efficient overload. + /// + /// Audio data to send. Must not exceed 100 KB or 1 second duration. + /// The cancellation token to cancel the operation. + /// If the session is not connected or sending fails. + Task SendAudioAsync(ArraySegment audioData, CancellationToken cancellationToken = default); + + /// + /// Signals the end of the audio stream. Causes finalization of tentative transcript segments and + /// triggers emission of final transcript updates, end-of-transcript, and end-of-stream messages. + /// No more audio data can be sent after calling this method. + /// + /// The cancellation token to cancel the operation. + /// If the session is not connected or sending fails. + Task EndAudioAsync(CancellationToken cancellationToken = default); + + /// + /// Requests a reconnection token and establishes a new WebSocket connection, resuming the session. + /// This should be called when the WebSocket connection is lost unexpectedly. + /// + /// The cancellation token to cancel the operation. + /// If reconnection fails. 
+ Task ReconnectAsync(CancellationToken cancellationToken = default); + } +} diff --git a/DeepL/Model/TargetMediaChunk.cs b/DeepL/Model/TargetMediaChunk.cs new file mode 100644 index 0000000..f6b1522 --- /dev/null +++ b/DeepL/Model/TargetMediaChunk.cs @@ -0,0 +1,68 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System.Text.Json.Serialization; + +namespace DeepL.Model { + /// + /// Represents a translated audio chunk from the Voice API. This feature is currently in closed beta. + /// Audio data is provided as an array of base64-encoded indivisible chunks. + /// + public sealed class TargetMediaChunk { + /// Initializes a new instance of . + /// The content type of the audio data. Present in the first message. + /// Number of header packets at the start of the data array, or null if all are audio. + /// Array of base64-encoded audio data packets. + /// Text corresponding to this audio chunk, for subtitle synchronization. + /// The target language of this audio chunk. + /// Duration of this audio chunk in seconds. + /// + /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it + /// would be marked , but needs to be for JSON deserialization. + /// In future this function may have backwards-incompatible changes. + /// + [JsonConstructor] + public TargetMediaChunk( + string? contentType, + int? headers, + string[] data, + string? text, + string? language, + double? duration) { + ContentType = contentType; + Headers = headers; + Data = data; + Text = text; + Language = language; + Duration = duration; + } + + /// The content type of the audio data. Present in the first message of a sequence. + [JsonPropertyName("content_type")] + public string? ContentType { get; } + + /// + /// Number of packets at the start of that contain initialization/header data. 
+ /// Null or absent when all packets are audio data. + /// + [JsonPropertyName("headers")] + public int? Headers { get; } + + /// Array of base64-encoded indivisible audio data packets. + [JsonPropertyName("data")] + public string[] Data { get; } + + /// Text corresponding to this audio chunk, for subtitle synchronization. + [JsonPropertyName("text")] + public string? Text { get; } + + /// The target language of this audio chunk. + [JsonPropertyName("language")] + public string? Language { get; } + + /// Duration of this audio chunk in seconds. + [JsonPropertyName("duration")] + public double? Duration { get; } + } +} diff --git a/DeepL/Model/TranscriptSegment.cs b/DeepL/Model/TranscriptSegment.cs new file mode 100644 index 0000000..b678ce2 --- /dev/null +++ b/DeepL/Model/TranscriptSegment.cs @@ -0,0 +1,29 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System.Text.Json.Serialization; + +namespace DeepL.Model { + /// A single text segment within a Voice API transcript update. + public sealed class TranscriptSegment { + /// Initializes a new instance of . + /// The text content of this segment. + /// + /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it + /// would be marked , but needs to be for JSON deserialization. + /// In future this function may have backwards-incompatible changes. + /// + [JsonConstructor] + public TranscriptSegment(string text) { + Text = text; + } + + /// The text content of this segment. + [JsonPropertyName("text")] + public string Text { get; } + + /// Returns the text content of this segment. 
+ public override string ToString() => Text; + } +} diff --git a/DeepL/Model/TranscriptUpdate.cs b/DeepL/Model/TranscriptUpdate.cs new file mode 100644 index 0000000..9db2adc --- /dev/null +++ b/DeepL/Model/TranscriptUpdate.cs @@ -0,0 +1,41 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System.Text.Json.Serialization; + +namespace DeepL.Model { + /// + /// Represents a transcript update from the Voice API, containing concluded (finalized) and tentative + /// (in-progress) text segments. Used for both source and target transcript updates. + /// + public sealed class TranscriptUpdate { + /// Initializes a new instance of . + /// Finalized text segments that will not change. + /// Preliminary text segments that may be refined. + /// The language code of this transcript update. Only present on target updates. + /// + /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it + /// would be marked , but needs to be for JSON deserialization. + /// In future this function may have backwards-incompatible changes. + /// + [JsonConstructor] + public TranscriptUpdate(TranscriptSegment[] concluded, TranscriptSegment[] tentative, string? language) { + Concluded = concluded; + Tentative = tentative; + Language = language; + } + + /// Finalized text segments that will not change. These segments are sent once and remain fixed. + [JsonPropertyName("concluded")] + public TranscriptSegment[] Concluded { get; } + + /// Preliminary text segments that may be refined as more audio context becomes available. + [JsonPropertyName("tentative")] + public TranscriptSegment[] Tentative { get; } + + /// The language code of this transcript update. Only present on target transcript updates. + [JsonPropertyName("language")] + public string? 
Language { get; } + } +} diff --git a/DeepL/Model/VoiceSessionInfo.cs b/DeepL/Model/VoiceSessionInfo.cs new file mode 100644 index 0000000..45aa899 --- /dev/null +++ b/DeepL/Model/VoiceSessionInfo.cs @@ -0,0 +1,40 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System.Text.Json.Serialization; + +namespace DeepL.Model { + /// Information about a Voice API session, received from the session request endpoint. + public sealed class VoiceSessionInfo { + /// Initializes a new instance of . + /// The WebSocket URL for establishing the stream connection. + /// Ephemeral authentication token for the streaming endpoint. + /// Unique identifier for the session. + /// + /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it + /// would be marked , but needs to be for JSON deserialization. + /// In future this function may have backwards-incompatible changes. + /// + [JsonConstructor] + public VoiceSessionInfo(string streamingUrl, string token, string? sessionId) { + StreamingUrl = streamingUrl; + Token = token; + SessionId = sessionId; + } + + /// The WebSocket URL to use for establishing the stream connection. + [JsonPropertyName("streaming_url")] + public string StreamingUrl { get; } + + /// + /// Ephemeral authentication token for the streaming endpoint. Valid for one-time use only. + /// + [JsonPropertyName("token")] + public string Token { get; } + + /// Unique identifier for the session. + [JsonPropertyName("session_id")] + public string? 
SessionId { get; } + } +} diff --git a/DeepL/Model/VoiceStreamError.cs b/DeepL/Model/VoiceStreamError.cs new file mode 100644 index 0000000..80a0311 --- /dev/null +++ b/DeepL/Model/VoiceStreamError.cs @@ -0,0 +1,41 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System.Text.Json.Serialization; + +namespace DeepL.Model { + /// Represents an error message received from the Voice API WebSocket connection. + public sealed class VoiceStreamError { + /// Initializes a new instance of . + /// The error code. + /// The reason code for the error. + /// A human-readable error message. + /// + /// The constructor for this class (and all other Model classes) should not be used by library users. Ideally it + /// would be marked , but needs to be for JSON deserialization. + /// In future this function may have backwards-incompatible changes. + /// + [JsonConstructor] + public VoiceStreamError(string? code, string? reason, string? message) { + Code = code; + Reason = reason; + Message = message; + } + + /// The error code. + [JsonPropertyName("code")] + public string? Code { get; } + + /// The reason code for the error. + [JsonPropertyName("reason")] + public string? Reason { get; } + + /// A human-readable error message. + [JsonPropertyName("message")] + public string? Message { get; } + + /// Returns the error message. + public override string ToString() => $"VoiceStreamError(code={Code}, reason={Reason}, message={Message})"; + } +} diff --git a/DeepL/SourceLanguageMode.cs b/DeepL/SourceLanguageMode.cs new file mode 100644 index 0000000..521037f --- /dev/null +++ b/DeepL/SourceLanguageMode.cs @@ -0,0 +1,29 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. 
+ +using System; + +namespace DeepL { + /// Controls how the source language value is used in Voice API sessions. + public enum SourceLanguageMode { + /// Treats source language as a hint; server can override. + Auto, + + /// Treats source language as mandatory; server must use this language. + Fixed + } + + /// Extension methods for . + public static class SourceLanguageModeExtensions { + /// Retrieves the string representation used by the DeepL API. + /// If an unknown enum value is passed. + public static string ToApiValue(this SourceLanguageMode mode) { + return mode switch { + SourceLanguageMode.Auto => "auto", + SourceLanguageMode.Fixed => "fixed", + _ => throw new ArgumentOutOfRangeException(nameof(mode), mode, "Unrecognized source language mode value") + }; + } + } +} diff --git a/DeepL/SourceMediaContentType.cs b/DeepL/SourceMediaContentType.cs new file mode 100644 index 0000000..fe48105 --- /dev/null +++ b/DeepL/SourceMediaContentType.cs @@ -0,0 +1,68 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +namespace DeepL { + /// + /// String constants for audio format content types supported by the DeepL Voice API. + /// Use these when configuring . + /// + public static class SourceMediaContentType { + /// Auto-detect container and codec. Supported for all formats except PCM. + public const string Auto = "audio/auto"; + + /// FLAC container with FLAC codec. + public const string Flac = "audio/flac"; + + /// MPEG container with MP3 codec. + public const string Mpeg = "audio/mpeg"; + + /// Ogg container with auto-detected codec (FLAC or OPUS). + public const string Ogg = "audio/ogg"; + + /// WebM container with OPUS codec. + public const string WebM = "audio/webm"; + + /// Matroska container with auto-detected codec. + public const string Matroska = "audio/x-matroska"; + + /// Ogg container with FLAC codec. 
+ public const string OggFlac = "audio/ogg;codecs=flac"; + + /// Ogg container with OPUS codec. + public const string OggOpus = "audio/ogg;codecs=opus"; + + /// PCM signed 16-bit little-endian at 8000 Hz. + public const string PcmS16le8000 = "audio/pcm;encoding=s16le;rate=8000"; + + /// PCM signed 16-bit little-endian at 16000 Hz. Recommended for general use. + public const string PcmS16le16000 = "audio/pcm;encoding=s16le;rate=16000"; + + /// PCM signed 16-bit little-endian at 44100 Hz. + public const string PcmS16le44100 = "audio/pcm;encoding=s16le;rate=44100"; + + /// PCM signed 16-bit little-endian at 48000 Hz. + public const string PcmS16le48000 = "audio/pcm;encoding=s16le;rate=48000"; + + /// PCM A-Law at 8000 Hz (G.711). + public const string PcmAlaw8000 = "audio/pcm;encoding=alaw;rate=8000"; + + /// PCM µ-Law at 8000 Hz (G.711). + public const string PcmUlaw8000 = "audio/pcm;encoding=ulaw;rate=8000"; + + /// WebM container with OPUS codec (explicit). + public const string WebMOpus = "audio/webm;codecs=opus"; + + /// Matroska container with AAC codec. + public const string MatroskaAac = "audio/x-matroska;codecs=aac"; + + /// Matroska container with FLAC codec. + public const string MatroskaFlac = "audio/x-matroska;codecs=flac"; + + /// Matroska container with MP3 codec. + public const string MatroskaMp3 = "audio/x-matroska;codecs=mp3"; + + /// Matroska container with OPUS codec. + public const string MatroskaOpus = "audio/x-matroska;codecs=opus"; + } +} diff --git a/DeepL/TargetMediaVoice.cs b/DeepL/TargetMediaVoice.cs new file mode 100644 index 0000000..10b5c33 --- /dev/null +++ b/DeepL/TargetMediaVoice.cs @@ -0,0 +1,32 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System; + +namespace DeepL { + /// + /// Target audio voice selection for synthesized speech in Voice API sessions. + /// This feature is currently in closed beta. 
+ /// + public enum TargetMediaVoice { + /// Male voice. + Male, + + /// Female voice. + Female + } + + /// Extension methods for . + public static class TargetMediaVoiceExtensions { + /// Retrieves the string representation used by the DeepL API. + /// If an unknown enum value is passed. + public static string ToApiValue(this TargetMediaVoice voice) { + return voice switch { + TargetMediaVoice.Male => "male", + TargetMediaVoice.Female => "female", + _ => throw new ArgumentOutOfRangeException(nameof(voice), voice, "Unrecognized target media voice value") + }; + } + } +} diff --git a/DeepL/VoiceMessageFormat.cs b/DeepL/VoiceMessageFormat.cs new file mode 100644 index 0000000..d4aace6 --- /dev/null +++ b/DeepL/VoiceMessageFormat.cs @@ -0,0 +1,29 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System; + +namespace DeepL { + /// Message encoding format for Voice API WebSocket communication. + public enum VoiceMessageFormat { + /// JSON-encoded messages sent as TEXT WebSocket frames. Binary fields are base64-encoded. + Json, + + /// MessagePack-encoded messages sent as BINARY WebSocket frames. Binary fields are raw binary. + MessagePack + } + + /// Extension methods for . + public static class VoiceMessageFormatExtensions { + /// Retrieves the string representation used by the DeepL API. + /// If an unknown enum value is passed. 
+ public static string ToApiValue(this VoiceMessageFormat format) { + return format switch { + VoiceMessageFormat.Json => "json", + VoiceMessageFormat.MessagePack => "msgpack", + _ => throw new ArgumentOutOfRangeException(nameof(format), format, "Unrecognized message format value") + }; + } + } +} diff --git a/DeepL/VoiceSession.cs b/DeepL/VoiceSession.cs new file mode 100644 index 0000000..0ff826f --- /dev/null +++ b/DeepL/VoiceSession.cs @@ -0,0 +1,258 @@ +// Copyright 2025 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. + +using System; +using System.Net.WebSockets; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using DeepL.Internal; +using DeepL.Model; + +namespace DeepL { + /// + /// Internal implementation of that manages a WebSocket connection + /// to the DeepL Voice API for real-time speech transcription and translation. + /// + internal sealed class VoiceSession : IVoiceSession { + private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions { + PropertyNamingPolicy = JsonNamingPolicy.CamelCase + }; + + private readonly DeepLHttpClient _httpClient; + private readonly object _lock = new object(); + private ClientWebSocket _webSocket; + private CancellationTokenSource _receiveCts; + private Task? _receiveTask; + private string _lastToken; + private bool _disposed; + + /// + public event EventHandler? SourceTranscriptUpdated; + + /// + public event EventHandler? TargetTranscriptUpdated; + + /// + public event EventHandler? TargetMediaChunkReceived; + + /// + public event EventHandler? ErrorReceived; + + /// + public event EventHandler? StreamEnded; + + /// + public string? 
SessionId { get; private set; } + + /// + public bool IsConnected { + get { + lock (_lock) { + return !_disposed && _webSocket.State == WebSocketState.Open; + } + } + } + + internal VoiceSession( + DeepLHttpClient httpClient, + ClientWebSocket webSocket, + VoiceSessionInfo sessionInfo) { + _httpClient = httpClient; + _webSocket = webSocket; + _lastToken = sessionInfo.Token; + SessionId = sessionInfo.SessionId; + _receiveCts = new CancellationTokenSource(); + _receiveTask = Task.Run(() => ReceiveLoopAsync(_receiveCts.Token)); + } + + /// + public async Task SendAudioAsync(byte[] audioData, CancellationToken cancellationToken = default) { + await SendAudioAsync(new ArraySegment(audioData), cancellationToken).ConfigureAwait(false); + } + + /// + public async Task SendAudioAsync(ArraySegment audioData, CancellationToken cancellationToken = default) { + EnsureConnected(); + + var base64Data = Convert.ToBase64String( + audioData.Array ?? throw new ArgumentException("Audio data array is null"), + audioData.Offset, + audioData.Count); + var message = $"{{\"source_media_chunk\":{{\"data\":\"{base64Data}\"}}}}"; + var bytes = Encoding.UTF8.GetBytes(message); + + await _webSocket.SendAsync( + new ArraySegment(bytes), + WebSocketMessageType.Text, + endOfMessage: true, + cancellationToken).ConfigureAwait(false); + } + + /// + public async Task EndAudioAsync(CancellationToken cancellationToken = default) { + EnsureConnected(); + + var message = "{\"end_of_source_media\":{}}"; + var bytes = Encoding.UTF8.GetBytes(message); + + await _webSocket.SendAsync( + new ArraySegment(bytes), + WebSocketMessageType.Text, + endOfMessage: true, + cancellationToken).ConfigureAwait(false); + } + + /// + public async Task ReconnectAsync(CancellationToken cancellationToken = default) { + // Stop current receive loop + _receiveCts.Cancel(); + if (_receiveTask != null) { + try { + await _receiveTask.ConfigureAwait(false); + } catch (OperationCanceledException) { + // Expected + } + } + + // Close 
existing WebSocket if still open + if (_webSocket.State == WebSocketState.Open || _webSocket.State == WebSocketState.CloseReceived) { + try { + await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Reconnecting", CancellationToken.None) + .ConfigureAwait(false); + } catch (WebSocketException) { + // Ignore close errors during reconnection + } + } + + _webSocket.Dispose(); + + // Request new token via GET v3/voice/realtime?token= + var queryParams = new[] { ("token", _lastToken) }; + using var responseMessage = await _httpClient.ApiGetAsync("v3/voice/realtime", cancellationToken, queryParams) + .ConfigureAwait(false); + await DeepLHttpClient.CheckStatusCodeAsync(responseMessage).ConfigureAwait(false); + var sessionInfo = await JsonUtils.DeserializeAsync(responseMessage).ConfigureAwait(false); + + _lastToken = sessionInfo.Token; + SessionId = sessionInfo.SessionId; + + // Establish new WebSocket connection + var wsUri = new Uri($"{sessionInfo.StreamingUrl}?token={Uri.EscapeDataString(sessionInfo.Token)}"); + _webSocket = new ClientWebSocket(); + await _webSocket.ConnectAsync(wsUri, cancellationToken).ConfigureAwait(false); + + // Restart receive loop + _receiveCts = new CancellationTokenSource(); + _receiveTask = Task.Run(() => ReceiveLoopAsync(_receiveCts.Token)); + } + + /// Background loop that receives and dispatches WebSocket messages. 
+ private async Task ReceiveLoopAsync(CancellationToken cancellationToken) { + var buffer = new byte[64 * 1024]; // 64 KB buffer + var messageBuilder = new StringBuilder(); + + try { + while (!cancellationToken.IsCancellationRequested && + _webSocket.State == WebSocketState.Open) { + messageBuilder.Clear(); + WebSocketReceiveResult result; + do { + result = await _webSocket.ReceiveAsync( + new ArraySegment(buffer), cancellationToken).ConfigureAwait(false); + + if (result.MessageType == WebSocketMessageType.Close) { + return; + } + + if (result.MessageType == WebSocketMessageType.Text) { + messageBuilder.Append(Encoding.UTF8.GetString(buffer, 0, result.Count)); + } + } while (!result.EndOfMessage); + + if (messageBuilder.Length > 0) { + DispatchMessage(messageBuilder.ToString()); + } + } + } catch (OperationCanceledException) { + // Normal cancellation + } catch (WebSocketException) { + // Connection lost — consumer should call ReconnectAsync + } + } + + /// Parses a JSON message from the WebSocket and dispatches it to the appropriate event. 
+ private void DispatchMessage(string json) { + try { + using var document = JsonDocument.Parse(json); + var root = document.RootElement; + + if (root.TryGetProperty("source_transcript_update", out var sourceUpdate)) { + var update = JsonSerializer.Deserialize(sourceUpdate.GetRawText(), JsonOptions); + if (update != null) { + SourceTranscriptUpdated?.Invoke(this, update); + } + } else if (root.TryGetProperty("target_transcript_update", out var targetUpdate)) { + var update = JsonSerializer.Deserialize(targetUpdate.GetRawText(), JsonOptions); + if (update != null) { + TargetTranscriptUpdated?.Invoke(this, update); + } + } else if (root.TryGetProperty("target_media_chunk", out var mediaChunk)) { + var chunk = JsonSerializer.Deserialize(mediaChunk.GetRawText(), JsonOptions); + if (chunk != null) { + TargetMediaChunkReceived?.Invoke(this, chunk); + } + } else if (root.TryGetProperty("end_of_source_transcript", out _)) { + // Source transcript complete — no special event needed, handled via StreamEnded + } else if (root.TryGetProperty("end_of_target_transcript", out _)) { + // Target transcript complete — no special event needed, handled via StreamEnded + } else if (root.TryGetProperty("end_of_target_media", out _)) { + // Target media complete — no special event needed, handled via StreamEnded + } else if (root.TryGetProperty("end_of_stream", out _)) { + StreamEnded?.Invoke(this, EventArgs.Empty); + } else if (root.TryGetProperty("error", out var errorElement)) { + var error = JsonSerializer.Deserialize(errorElement.GetRawText(), JsonOptions); + if (error != null) { + ErrorReceived?.Invoke(this, error); + } + } + } catch (JsonException) { + // Ignore malformed messages + } + } + + private void EnsureConnected() { + if (_disposed) { + throw new ObjectDisposedException(nameof(VoiceSession)); + } + + if (_webSocket.State != WebSocketState.Open) { + throw new DeepLException("Voice session WebSocket is not connected"); + } + } + + /// Releases the WebSocket connection and 
stops the receive loop.
+        public void Dispose() {
+            // Dispose is idempotent: only the first caller gets past this guard.
+            lock (_lock) {
+                if (_disposed) return;
+                _disposed = true;
+            }
+
+            try {
+                // Signal the receive loop to stop before closing the connection.
+                _receiveCts.Cancel();
+
+                if (_webSocket.State == WebSocketState.Open) {
+                    // Bound the close handshake so that an unresponsive peer cannot block Dispose forever;
+                    // on timeout the pending close is cancelled and the socket is disposed below.
+                    using var closeTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(5));
+                    _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Disposing", closeTimeout.Token)
+                        .GetAwaiter().GetResult();
+                }
+            } catch (Exception e) when (e is WebSocketException || e is OperationCanceledException ||
+                                        e is ObjectDisposedException || e is InvalidOperationException) {
+                // Best-effort close: Dispose must not throw because the connection is already broken,
+                // the handshake timed out, or the socket changed state concurrently.
+            } finally {
+                // Always release both resources, even if cancelling or closing failed.
+                _webSocket.Dispose();
+                _receiveCts.Dispose();
+            }
+        }
+    }
+}
diff --git a/DeepL/VoiceSessionOptions.cs b/DeepL/VoiceSessionOptions.cs
new file mode 100644
index 0000000..cf1235c
--- /dev/null
+++ b/DeepL/VoiceSessionOptions.cs
@@ -0,0 +1,70 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+namespace DeepL {
+    /// <summary>
+    /// Options to control Voice API session creation. These options are provided to
+    /// <see cref="IVoiceManager.CreateVoiceSessionAsync"/>.
+    /// </summary>
+    public sealed class VoiceSessionOptions {
+        /// <summary>Initializes a new <see cref="VoiceSessionOptions"/> object.</summary>
+        public VoiceSessionOptions() { }
+
+        /// <summary>
+        /// The audio format for streaming, which specifies container, codec, and encoding parameters.
+        /// Use constants from <see cref="DeepL.SourceMediaContentType"/> for supported values. Required.
+        /// </summary>
+        public string SourceMediaContentType { get; set; } = DeepL.SourceMediaContentType.Auto;
+
+        /// <summary>
+        /// Message encoding format for WebSocket communication. Defaults to <see cref="VoiceMessageFormat.Json"/>.
+        /// </summary>
+        public VoiceMessageFormat? MessageFormat { get; set; }
+
+        /// <summary>
+        /// The source language of the audio stream, or null for auto-detection.
+        /// Must be a supported Voice API source language complying with IETF BCP 47 language tags.
+        /// </summary>
+        public string? SourceLanguage { get; set; }
+
+        /// <summary>
+        /// Controls how the <see cref="SourceLanguage"/> value is used.
+        /// Defaults to <see cref="DeepL.SourceLanguageMode.Auto"/> if not specified.
+        /// </summary>
+        public SourceLanguageMode? SourceLanguageMode { get; set; }
+
+        /// <summary>
+        /// List of target languages for translation. The stream will emit translations for each language.
+        /// Maximum 5 target languages per session.
Language identifiers must comply with IETF BCP 47.
+        /// </summary>
+        public string[] TargetLanguages { get; set; } = System.Array.Empty<string>();
+
+        /// <summary>
+        /// List of target languages for which to generate synthesized audio. This feature is in closed beta.
+        /// Languages specified here will automatically be added to <see cref="TargetLanguages"/> if not already present.
+        /// Maximum 5 target media languages per session.
+        /// </summary>
+        public string[]? TargetMediaLanguages { get; set; }
+
+        /// <summary>
+        /// The audio format for synthesized target media streaming. This feature is in closed beta.
+        /// Defaults to "audio/webm;codecs=opus" if not specified.
+        /// </summary>
+        public string? TargetMediaContentType { get; set; }
+
+        /// <summary>
+        /// Target audio voice selection for synthesized speech. This feature is in closed beta.
+        /// </summary>
+        public TargetMediaVoice? TargetMediaVoice { get; set; }
+
+        /// <summary>A glossary ID to use for translation.</summary>
+        public string? GlossaryId { get; set; }
+
+        /// <summary>
+        /// Sets whether the translated text should lean towards formal or informal language.
+        /// Possible values: "default", "formal", "more", "informal", "less".
+        /// </summary>
+        public string? Formality { get; set; }
+    }
+}
diff --git a/DeepLTests/VoiceSessionTest.cs b/DeepLTests/VoiceSessionTest.cs
new file mode 100644
index 0000000..6f494a4
--- /dev/null
+++ b/DeepLTests/VoiceSessionTest.cs
@@ -0,0 +1,180 @@
+// Copyright 2025 DeepL SE (https://www.deepl.com)
+// Use of this source code is governed by an MIT
+// license that can be found in the LICENSE file.
+
+using System;
+using System.Collections.Generic;
+using System.Text.Json;
+using System.Threading.Tasks;
+using DeepL;
+using DeepL.Model;
+using Xunit;
+
+namespace DeepLTests {
+    /// <summary>Unit tests for Voice API types that do not require API access.</summary>
+    public sealed class VoiceSessionUnitTest {
+        [Fact]
+        public void TestVoiceSessionOptionsDefaults() {
+            var options = new VoiceSessionOptions();
+            Assert.Equal(SourceMediaContentType.Auto, options.SourceMediaContentType);
+            Assert.Null(options.MessageFormat);
+            Assert.Null(options.SourceLanguage);
+            Assert.Null(options.SourceLanguageMode);
+            Assert.NotNull(options.TargetLanguages); // empty array rather than null by default
+            Assert.Empty(options.TargetLanguages);
+            Assert.Null(options.TargetMediaLanguages);
+            Assert.Null(options.TargetMediaContentType);
+            Assert.Null(options.TargetMediaVoice);
+            Assert.Null(options.GlossaryId);
+            Assert.Null(options.Formality);
+        }
+
+        [Fact]
+        public void TestVoiceSessionOptionsConfiguration() {
+            var options = new VoiceSessionOptions {
+                SourceMediaContentType = SourceMediaContentType.OggOpus,
+                MessageFormat = VoiceMessageFormat.Json,
+                SourceLanguage = "en",
+                SourceLanguageMode = DeepL.SourceLanguageMode.Fixed,
+                TargetLanguages = new[] { "de", "fr", "es" },
+                TargetMediaVoice = TargetMediaVoice.Female,
+                GlossaryId = "test-glossary-id",
+                Formality = "formal"
+            };
+
+            Assert.Equal(SourceMediaContentType.OggOpus, options.SourceMediaContentType);
+            Assert.Equal(VoiceMessageFormat.Json, options.MessageFormat);
+            Assert.Equal("en", options.SourceLanguage);
+            Assert.Equal(DeepL.SourceLanguageMode.Fixed, options.SourceLanguageMode);
+            Assert.Equal(3, options.TargetLanguages.Length);
+            Assert.Equal(TargetMediaVoice.Female, options.TargetMediaVoice);
+            Assert.Equal("test-glossary-id", options.GlossaryId);
+            Assert.Equal("formal", options.Formality);
+        }
+
+        [Fact]
+        public void TestVoiceMessageFormatApiValues() {
+            Assert.Equal("json", VoiceMessageFormat.Json.ToApiValue());
+            Assert.Equal("msgpack", VoiceMessageFormat.MessagePack.ToApiValue());
+        }
+
+        [Fact]
+        public void TestSourceLanguageModeApiValues() {
+            Assert.Equal("auto", DeepL.SourceLanguageMode.Auto.ToApiValue());
+            Assert.Equal("fixed", DeepL.SourceLanguageMode.Fixed.ToApiValue());
+        }
+
+        [Fact]
+        public void TestTargetMediaVoiceApiValues() {
+            Assert.Equal("male", TargetMediaVoice.Male.ToApiValue());
+            Assert.Equal("female", TargetMediaVoice.Female.ToApiValue());
+        }
+
+        [Fact]
+        public void TestVoiceSessionInfoDeserialization() {
+            var json = "{\"streaming_url\":\"wss://api.deepl.com/v3/voice/realtime/connect\"," +
+                       "\"token\":\"test-token-123\"," +
+                       "\"session_id\":\"test-session-456\"}";
+            var info = JsonSerializer.Deserialize<VoiceSessionInfo>(json);
+            Assert.NotNull(info);
+            Assert.Equal("wss://api.deepl.com/v3/voice/realtime/connect", info!.StreamingUrl);
+            Assert.Equal("test-token-123", info.Token);
+            Assert.Equal("test-session-456", info.SessionId);
+        }
+
+        [Fact]
+        public void TestTranscriptUpdateDeserialization() {
+            var json = "{\"concluded\":[{\"text\":\"Hello \"}],\"tentative\":[{\"text\":\"world\"}],\"language\":\"de\"}";
+            var update = JsonSerializer.Deserialize<TranscriptUpdate>(json);
+            Assert.NotNull(update);
+            Assert.Single(update!.Concluded);
+            Assert.Equal("Hello ", update.Concluded[0].Text); // trailing space must survive deserialization
+            Assert.Single(update.Tentative);
+            Assert.Equal("world", update.Tentative[0].Text);
+            Assert.Equal("de", update.Language);
+        }
+
+        [Fact]
+        public void TestTranscriptSegmentDeserialization() {
+            var json = "{\"text\":\"Hello world\"}";
+            var segment = JsonSerializer.Deserialize<TranscriptSegment>(json);
+            Assert.NotNull(segment);
+            Assert.Equal("Hello world", segment!.Text);
+            Assert.Equal("Hello world", segment.ToString());
+        }
+
+        [Fact]
+        public void TestTargetMediaChunkDeserialization() {
+            var json = "{\"content_type\":\"audio/webm;codecs=opus\"," +
+                       "\"headers\":1," +
+                       "\"data\":[\"base64data1\",\"base64data2\"]," +
+                       "\"text\":\"Hallo Welt\"," +
+                       "\"language\":\"de\"," +
+                       "\"duration\":1.5}";
+            var chunk = JsonSerializer.Deserialize<TargetMediaChunk>(json);
+            Assert.NotNull(chunk);
+            Assert.Equal("audio/webm;codecs=opus", chunk!.ContentType);
+            Assert.Equal(1, chunk.Headers);
+            Assert.Equal(2, chunk.Data.Length);
+            Assert.Equal("base64data1", chunk.Data[0]);
+            Assert.Equal("Hallo Welt", chunk.Text);
+            Assert.Equal("de", chunk.Language);
+            Assert.Equal(1.5, chunk.Duration); // 1.5 is exactly representable, so exact equality is safe
+        }
+
+        [Fact]
+        public void TestVoiceStreamErrorDeserialization() {
+            var json = "{\"code\":\"4001\",\"reason\":\"invalid_audio\",\"message\":\"Audio format not supported\"}";
+            var error = JsonSerializer.Deserialize<VoiceStreamError>(json);
+            Assert.NotNull(error);
+            Assert.Equal("4001", error!.Code);
+            Assert.Equal("invalid_audio", error.Reason);
+            Assert.Equal("Audio format not supported", error.Message);
+        }
+
+        [Fact]
+        public void TestSourceMediaContentTypeConstants() {
+            Assert.Equal("audio/auto", SourceMediaContentType.Auto);
+            Assert.Equal("audio/flac", SourceMediaContentType.Flac);
+            Assert.Equal("audio/mpeg", SourceMediaContentType.Mpeg);
+            Assert.Equal("audio/ogg", SourceMediaContentType.Ogg);
+            Assert.Equal("audio/webm", SourceMediaContentType.WebM);
+            Assert.Equal("audio/x-matroska", SourceMediaContentType.Matroska);
+            Assert.Equal("audio/ogg;codecs=flac", SourceMediaContentType.OggFlac);
+            Assert.Equal("audio/ogg;codecs=opus", SourceMediaContentType.OggOpus);
+            Assert.Equal("audio/pcm;encoding=s16le;rate=16000", SourceMediaContentType.PcmS16le16000);
+            Assert.Equal("audio/webm;codecs=opus", SourceMediaContentType.WebMOpus);
+        }
+    }
+
+    /// <summary>Tests for Voice API session creation that require API access.</summary>
+    public sealed class VoiceSessionClientTest : BaseDeepLTest {
+        [Fact]
+        public async Task TestCreateSessionRequiresTargetLanguages() {
+            var client = CreateTestClient();
+            var options = new VoiceSessionOptions {
+                SourceMediaContentType = SourceMediaContentType.OggOpus
+            };
+            // TargetLanguages is left at its empty default, which must be rejected.
+            await Assert.ThrowsAsync<ArgumentException>(
+                () => client.CreateVoiceSessionAsync(options));
+        }
+
+        [Fact]
+        public async Task TestCreateSessionRejectsExcessiveTargetLanguages() {
+            var client = CreateTestClient();
+            var options = new VoiceSessionOptions {
+                SourceMediaContentType = SourceMediaContentType.OggOpus,
+                TargetLanguages = new[] { "de", "fr", "es", "it", "nl", "pt" } // six, one over the limit of five
+            };
+            await Assert.ThrowsAsync<ArgumentException>(
+                () => client.CreateVoiceSessionAsync(options));
+        }
+
+        [Fact]
+        public async Task TestCreateSessionRejectsNullOptions() {
+            var client = CreateTestClient();
+            await Assert.ThrowsAsync<ArgumentNullException>(
+                () => client.CreateVoiceSessionAsync(null!));
+        }
+    }
+}
From 97f977abf2031ae30c86ce8fe82a18483a2ce918 Mon Sep 17 00:00:00 2001 From: Tim Cadenbach Date: Wed, 8 Apr 2026 21:20:38 +0200 Subject: [PATCH 2/2] Update DeepL/VoiceSessionOptions.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- DeepL/VoiceSessionOptions.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DeepL/VoiceSessionOptions.cs b/DeepL/VoiceSessionOptions.cs index cf1235c..81a6cac 100644 --- a/DeepL/VoiceSessionOptions.cs +++ b/DeepL/VoiceSessionOptions.cs @@ -18,7 +18,8 @@ public VoiceSessionOptions() { } public string SourceMediaContentType { get; set; } = DeepL.SourceMediaContentType.Auto; /// - /// Message encoding format for WebSocket communication. Defaults to . + /// Message encoding format for WebSocket communication. If null, the API default is used + /// (currently ). /// public VoiceMessageFormat? MessageFormat { get; set; }