From 8f225af35615d769929ef2c2c461b94c62b1730e Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Thu, 19 Feb 2026 09:38:22 +0100 Subject: [PATCH 1/4] Update transcriptions docs (#3961) --- demos/audio/README.md | 51 ++++++++++++++++++-- demos/common/export_models/export_model.py | 4 +- docs/model_server_rest_api_speech_to_text.md | 4 +- docs/speech_recognition/reference.md | 4 +- 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/demos/audio/README.md b/demos/audio/README.md index 7989cfb4c7..bb650d47c8 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -144,7 +144,7 @@ An asynchronous benchmarking client can be used to access the model server perfo git clone https://github.com/openvinotoolkit/model_server cd model_server/demos/benchmark/v3/ pip install -r requirements.txt -python benchmark.py --api_url http://localhost:8122/v3/audio/speech --model microsoft/speecht5_tts --batch_size 1 --limit 100 --request_rate inf --backend text2speech --dataset edinburghcstr/ami --hf-subset 'ihm' --tokenizer openai/whisper-large-v3-turbo --trust-remote-code True +python benchmark.py --api_url http://localhost:8000/v3/audio/speech --model microsoft/speecht5_tts --batch_size 1 --limit 100 --request_rate inf --backend text2speech --dataset edinburghcstr/ami --hf-subset 'ihm' --tokenizer openai/whisper-large-v3-turbo --trust-remote-code True Number of documents: 100 100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:58<00:00, 1.19s/it] Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation. 
@@ -178,10 +178,11 @@ Run `export_model.py` script to download and quantize the model: **CPU** ```console -python export_model.py speech2text --source_model openai/whisper-large-v3-turbo --weight-format fp16 --model_name openai/whisper-large-v3-turbo --config_file_path models/config.json --model_repository_path models --overwrite_models +python export_model.py speech2text --source_model openai/whisper-large-v3-turbo --weight-format fp16 --model_name openai/whisper-large-v3-turbo --config_file_path models/config.json --model_repository_path models --overwrite_models --enable_word_timestamps ``` > **Note:** Change the `--weight-format` to quantize the model to `int8` precision to reduce memory consumption and improve performance. +> **Note:** `--enable_word_timestamps` can be omitted if there is no need for word timestamps support. ### Deployment @@ -230,7 +231,7 @@ Transcript file that was previously generated with audio/speech endpoint. ```bash -curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@speech.wav" -F model="openai/whisper-large-v3-turbo" +curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@speech.wav" -F model="openai/whisper-large-v3-turbo" -F language="en" ``` ```json {"text": " The quick brown fox jumped over the lazy dog."} @@ -253,6 +254,7 @@ client = OpenAI(base_url=url, api_key="not_used") audio_file = open(filename, "rb") transcript = client.audio.transcriptions.create( model="openai/whisper-large-v3-turbo", + language="en", file=audio_file ) @@ -262,6 +264,49 @@ print(transcript.text) The quick brown fox jumped over the lazy dog. 
``` ::: +:::{dropdown} **Unary call with timestamps** + + +```bash +curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@speech.wav" -F model="openai/whisper-large-v3-turbo" -F language="en" -F timestamp_granularities[]="segment" -F timestamp_granularities[]="word" +``` +```json +{"text":" A quick brown fox jumped over the lazy dog","words":[{"word":" A","start":0.0,"end":0.14000000059604645},{"word":" quick","start":0.14000000059604645,"end":0.3400000035762787},{"word":" brown","start":0.3400000035762787,"end":0.7799999713897705},{"word":" fox","start":0.7799999713897705,"end":1.3199999332427979},{"word":" jumped","start":1.3199999332427979,"end":1.7799999713897705},{"word":" over","start":1.7799999713897705,"end":2.0799999237060547},{"word":" the","start":2.0799999237060547,"end":2.259999990463257},{"word":" lazy","start":2.259999990463257,"end":2.5399999618530273},{"word":" dog","start":2.5399999618530273,"end":2.919999837875366}],"segments":[{"text":" A quick brown fox jumped over the lazy dog","start":0.0,"end":3.1399998664855957}]} +``` +::: + +:::{dropdown} **Unary call with python OpenAI library with timestamps** + +```python +from pathlib import Path +from openai import OpenAI + +filename = "speech.wav" +url="http://localhost:8000/v3" + + +speech_file_path = Path(__file__).parent / filename +client = OpenAI(base_url=url, api_key="not_used") + +audio_file = open(filename, "rb") +transcript = client.audio.transcriptions.create( + model="openai/whisper-large-v3-turbo", + language="en", + response_format="verbose_json", + timestamp_granularities=["segment", "word"], + file=audio_file +) + +print(transcript.text) +print(transcript.segments) +print(transcript.words) +``` +``` + A quick brown fox jumped over the lazy dog +[TranscriptionSegment(id=None, avg_logprob=None, compression_ratio=None, end=3.1399998664855957, no_speech_prob=None, seek=None, start=0.0, temperature=None, text=' A quick brown fox jumped over the 
lazy dog', tokens=None)] +[TranscriptionWord(end=0.14000000059604645, start=0.0, word=' A'), TranscriptionWord(end=0.3400000035762787, start=0.14000000059604645, word=' quick'), TranscriptionWord(end=0.7799999713897705, start=0.3400000035762787, word=' brown'), TranscriptionWord(end=1.3199999332427979, start=0.7799999713897705, word=' fox'), TranscriptionWord(end=1.7799999713897705, start=1.3199999332427979, word=' jumped'), TranscriptionWord(end=2.0799999237060547, start=1.7799999713897705, word=' over'), TranscriptionWord(end=2.259999990463257, start=2.0799999237060547, word=' the'), TranscriptionWord(end=2.5399999618530273, start=2.259999990463257, word=' lazy'), TranscriptionWord(end=2.919999837875366, start=2.5399999618530273, word=' dog')] +``` +::: ## Benchmarking transcription An asynchronous benchmarking client can be used to access the model server performance with various load conditions. Below are execution examples captured on Intel(R) Core(TM) Ultra 7 258V. diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 97a9149b8b..d84c564c22 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -95,6 +95,7 @@ def add_common_arguments(parser): parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint') add_common_arguments(parser_speech2text) parser_speech2text.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') +parser_speech2text.add_argument('--enable_word_timestamps', default=False, action='store_true', help='Load model with word timestamps support.', dest='enable_word_timestamps') args = vars(parser.parse_args()) t2s_graph_template = """ @@ -129,7 +130,8 @@ def add_common_arguments(parser): [type.googleapis.com / mediapipe.S2tCalculatorOptions]: { models_path: "{{model_path}}", plugin_config: '{ 
"NUM_STREAMS": "{{num_streams|default(1, true)}}" }', - target_device: "{{target_device|default("CPU", true)}}" + target_device: "{{target_device|default("CPU", true)}}", + enable_word_timestamps: {% if not enable_word_timestamps %}false{% else %}true{% endif%}, } } } diff --git a/docs/model_server_rest_api_speech_to_text.md b/docs/model_server_rest_api_speech_to_text.md index f3bf916b9b..b82d482481 100644 --- a/docs/model_server_rest_api_speech_to_text.md +++ b/docs/model_server_rest_api_speech_to_text.md @@ -47,8 +47,8 @@ curl -X POST http://localhost:8000/v3/audio/translations \ | prompt | ❌ | ✅ | string | An optional text to guide the model's style or continue a previous audio segment. | | response_format | ❌ | ✅ | string | The format of the output. | | stream | ❌ | ✅ | boolean | Generate the response in streaming mode. | -| temperature | ❌ | ✅ | number | The sampling temperature, between 0 and 1. | -| timestamp_granularities | ❌ | ✅ | array | The timestamp granularities to populate for this transcription. | +| temperature | ⚠️ | ✅ | number | The sampling temperature, between 0 and 1. | +| timestamp_granularities | ⚠️ | ✅ | array | The timestamp granularities to populate for this transcription. 
Supported values: "word" and "segment" (⚠️**Note**: To enable word timestamps `enable_word_timestamps: true` needs to be set in graph.pbtxt) | ### Translation diff --git a/docs/speech_recognition/reference.md b/docs/speech_recognition/reference.md index 3ad4e0c16d..2bfe69476f 100644 --- a/docs/speech_recognition/reference.md +++ b/docs/speech_recognition/reference.md @@ -42,7 +42,8 @@ node { node_options: { [type.googleapis.com / mediapipe.S2tCalculatorOptions]: { models_path: "./", - target_device: "CPU" + target_device: "CPU", + enable_word_timestamps: true } } } @@ -53,6 +54,7 @@ Above node configuration should be used as a template since user is not expected The calculator supports the following `node_options` for tuning the pipeline configuration: - `required string models_path` - location of the models and scheduler directory (can be relative); - `optional string device` - device to load models to. Supported values: "CPU, GPU, NPU" [default = "CPU"] +- `optional bool enable_word_timestamps` - set to true if the model should support user requests for word timestamps. [default = false] We recommend using [export script](../../demos/common/export_models/README.md) to prepare models directory structure for serving. Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#speech-recognition-models). 
From 71d42d5e4fd3ab237f907794a6debf1fef4bf032 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Fri, 20 Feb 2026 09:53:43 +0100 Subject: [PATCH 2/4] Speaker embeddings demo improvements (#3987) --- demos/audio/README.md | 95 ++++++++++++---------- demos/audio/create_speaker_embedding.py | 36 ++++++++ demos/audio/requirements.txt | 5 ++ demos/common/export_models/export_model.py | 12 ++- 4 files changed, 105 insertions(+), 43 deletions(-) create mode 100644 demos/audio/create_speaker_embedding.py create mode 100644 demos/audio/requirements.txt diff --git a/demos/audio/README.md b/demos/audio/README.md index bb650d47c8..8efda4018d 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -16,6 +16,17 @@ Check supported [Speech Recognition Models](https://openvinotoolkit.github.io/op **Client**: curl or Python for using OpenAI client package ## Speech generation +### Prepare speaker embeddings +When generating speech you can use default speaker voice or you can prepare your own speaker embedding file. Here you can see how to do it with downloaded file from online repository, but you can try with your own speech recording as well: +```bash +pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/audio/requirements.txt +mkdir -p audio_samples +curl --output audio_samples/audio.wav "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0032_8k.wav" +mkdir -p models +mkdir -p models/speakers +python create_speaker_embedding.py audio_samples/audio.wav models/speakers/voice1.bin +``` + ### Model preparation Supported models should use the topology of [microsoft/speecht5_tts](https://huggingface.co/microsoft/speecht5_tts) which needs to be converted to IR format before using in OVMS. 
@@ -40,48 +51,14 @@ Run `export_model.py` script to download and quantize the model: **CPU** ```console -python export_model.py text2speech --source_model microsoft/speecht5_tts --weight-format fp16 --model_name microsoft/speecht5_tts --config_file_path models/config.json --model_repository_path models --overwrite_models --vocoder microsoft/speecht5_hifigan +python export_model.py text2speech --source_model microsoft/speecht5_tts --weight-format fp16 --model_name microsoft/speecht5_tts --config_file_path models/config.json --model_repository_path models --overwrite_models --vocoder microsoft/speecht5_hifigan --speaker_name voice1 --speaker_path /models/speakers/voice1.bin ``` > **Note:** Change the `--weight-format` to quantize the model to `int8` precision to reduce memory consumption and improve performance. +> **Note:** `--speaker_name` and `--speaker_path` may be omitted if the default model voice is sufficient. The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [T2s calculator documentation](../../docs/speech_generation/reference.md) to learn more about configuration options and limitations. 
-### Speaker embeddings - -Instead of generating speech with default model voice you can create speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py) -```bash -curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py" -python create_speaker_embedding.py -mv speaker_embedding.bin models/ -``` -Script records your speech for 5 seconds(you can adjust duration of recording to achieve better results) and then, using speechbrain/spkrec-xvect-voxceleb model, creates `speaker_embedding.bin` file that contains your speaker embedding. -Now you need to add speaker embedding path to graph.pbtxt file of text2speech graph: -``` -input_stream: "HTTP_REQUEST_PAYLOAD:input" -output_stream: "HTTP_RESPONSE_PAYLOAD:output" -node { - name: "T2sExecutor" - input_side_packet: "TTS_NODE_RESOURCES:t2s_servable" - calculator: "T2sCalculator" - input_stream: "HTTP_REQUEST_PAYLOAD:input" - output_stream: "HTTP_RESPONSE_PAYLOAD:output" - node_options: { - [type.googleapis.com / mediapipe.T2sCalculatorOptions]: { - models_path: "./", - plugin_config: '{ "NUM_STREAMS": "1" }', - target_device: "CPU", - voices: [ - { - name: "voice", - path: "/models/speaker_embedding.bin", - } - ] - } - } -} -``` - ### Deployment **CPU** @@ -101,7 +78,7 @@ ovms --rest_port 8000 --source_model microsoft/speecht5_tts --model_repository_p ### Request Generation -:::{dropdown} **Unary call with curl** +:::{dropdown} **Unary call with curl with default voice** ```bash @@ -109,7 +86,41 @@ curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" - ``` ::: -:::{dropdown} **Unary call with OpenAi python library** +:::{dropdown} **Unary call with OpenAI python library with default voice** + +```python +from pathlib import Path +from openai import OpenAI + +prompt = 
"The quick brown fox jumped over the lazy dog" +filename = "speech.wav" +url="http://localhost:8000/v3" + + +speech_file_path = Path(__file__).parent / "speech.wav" +client = OpenAI(base_url=url, api_key="not_used") + +with client.audio.speech.with_streaming_response.create( + model="microsoft/speecht5_tts", + voice=None, + input=prompt +) as response: + response.stream_to_file(speech_file_path) + + +print("Generation finished") +``` +::: + +:::{dropdown} **Unary call with curl** + + +```bash +curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"microsoft/speecht5_tts\", \"voice\":\"voice1\", \"input\": \"The quick brown fox jumped over the lazy dog\"}" -o speech.wav +``` +::: + +:::{dropdown} **Unary call with OpenAI python library** ```python from pathlib import Path @@ -125,7 +136,7 @@ client = OpenAI(base_url=url, api_key="not_used") with client.audio.speech.with_streaming_response.create( model="microsoft/speecht5_tts", - voice="unused", + voice="voice1", input=prompt ) as response: response.stream_to_file(speech_file_path) @@ -222,12 +233,12 @@ ovms --rest_port 8000 --source_model openai/whisper-large-v3-turbo --model_repos ``` ::: -The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [S2t calculator documentation](../../docs/speech_recognition/reference.md) to learn more about configuration options and limitations. +The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [s2t calculator documentation](../../docs/speech_recognition/reference.md) to learn more about configuration options and limitations. ### Request Generation Transcript file that was previously generated with audio/speech endpoint. 
-:::{dropdown} **Unary call with curl** +:::{dropdown} **Unary call with cURL** ```bash @@ -381,7 +392,7 @@ ovms --rest_port 8000 --source_model OpenVINO/whisper-large-v3-fp16-ov --model_r ### Request Generation Transcript and translate file that was previously generated with audio/speech endpoint. -:::{dropdown} **Unary call with curl** +:::{dropdown} **Unary call with cURL** ```bash diff --git a/demos/audio/create_speaker_embedding.py b/demos/audio/create_speaker_embedding.py new file mode 100644 index 0000000000..1d0a5663b4 --- /dev/null +++ b/demos/audio/create_speaker_embedding.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torchaudio +from speechbrain.inference.speaker import EncoderClassifier +import sys + +if len(sys.argv) != 3: + print(f"Usage: {sys.argv[0]} <input_audio_file> <output_embedding_file>") + sys.exit(1) + +file = sys.argv[1] +signal, fs = torchaudio.load(file) +if signal.shape[0] > 1: + signal = torch.mean(signal, dim=0, keepdim=True) +expected_sample_rate = 16000 +if fs != expected_sample_rate: + resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=expected_sample_rate) + signal = resampler(signal) + +if signal.ndim != 2 or signal.shape[0] != 1: + print(f"Error: expected signal shape [1, num_samples], got {list(signal.shape)}") + sys.exit(1) +if signal.shape[1] == 0: + print("Error: audio file contains no samples") + sys.exit(1) + +classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb") +embedding = classifier.encode_batch(signal) +embedding = torch.nn.functional.normalize(embedding, dim=2) +embedding = embedding.squeeze().cpu().numpy().astype("float32") + +output_file = sys.argv[2] +embedding.tofile(output_file) \ No newline at end of file diff --git a/demos/audio/requirements.txt b/demos/audio/requirements.txt new file mode 100644 index 0000000000..b57ab92bd2 --- /dev/null +++ b/demos/audio/requirements.txt @@ -0,0 +1,5 @@ 
+--extra-index-url "https://download.pytorch.org/whl/cpu" +torch==2.9.1+cpu +torchaudio==2.9.1+cpu +speechbrain==1.0.3 +openai==2.21.0 \ No newline at end of file diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index d84c564c22..3e986da25a 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -91,6 +91,9 @@ def add_common_arguments(parser): add_common_arguments(parser_text2speech) parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example microsoft/speecht5_hifigan', dest='vocoder') +parser_text2speech.add_argument('--speaker_name', type=str, help='Name of the speaker', dest='speaker_name') +parser_text2speech.add_argument('--speaker_path', type=str, help='Path to the speaker.bin file.', dest='speaker_path') + parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint') add_common_arguments(parser_speech2text) @@ -111,7 +114,14 @@ def add_common_arguments(parser): [type.googleapis.com / mediapipe.T2sCalculatorOptions]: { models_path: "{{model_path}}", plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }', - target_device: "{{target_device|default("CPU", true)}}" + target_device: "{{target_device|default("CPU", true)}}", + {%- if speaker_name and speaker_path %} + voices: [ + { + name: "{{speaker_name}}", + path: "{{speaker_path}}" + } + ]{% endif %} } } } From 8ccafa6d9b4f6114638f03546d300d60a5861d06 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 20 Feb 2026 10:10:12 +0100 Subject: [PATCH 3/4] fix --- demos/audio/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/audio/README.md b/demos/audio/README.md index 
8efda4018d..92a06eef23 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -19,7 +19,7 @@ Check supported [Speech Recognition Models](https://openvinotoolkit.github.io/op ### Prepare speaker embeddings When generating speech you can use default speaker voice or you can prepare your own speaker embedding file. Here you can see how to do it with downloaded file from online repository, but you can try with your own speech recording as well: ```bash -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/audio/requirements.txt +pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2026/0/demos/audio/requirements.txt mkdir -p audio_samples curl --output audio_samples/audio.wav "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0032_8k.wav" mkdir -p models @@ -73,7 +73,7 @@ docker run -d -u $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models:rw ```bat mkdir models -ovms --rest_port 8000 --source_model microsoft/speecht5_tts --model_repository_path models --model_name microsoft/speecht5_tts --task text2speech --target_device CPU +ovms --rest_port 8000 --model_path models/microsoft/speecht5_tts --model_name microsoft/speecht5_tts ``` ### Request Generation From 6fd7a4ef24ed7be4e911ca7c01865cabe19d2152 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 20 Feb 2026 10:13:40 +0100 Subject: [PATCH 4/4] fix --- demos/audio/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/audio/README.md b/demos/audio/README.md index 92a06eef23..277b64ff44 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -72,7 +72,7 @@ docker run -d -u $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models:rw **Deploying on Bare Metal** ```bat -mkdir models +mkdir -p models ovms --rest_port 8000 --model_path models/microsoft/speecht5_tts --model_name microsoft/speecht5_tts ```