From a26f7b095c7106cdd67beae3699139944eb202fd Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Wed, 18 Feb 2026 16:41:40 +0100 Subject: [PATCH 01/10] Add log --- .../continuous_batching/llm_executor.hpp | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/llm/language_model/continuous_batching/llm_executor.hpp b/src/llm/language_model/continuous_batching/llm_executor.hpp index f396d78dc3..14bc58f047 100644 --- a/src/llm/language_model/continuous_batching/llm_executor.hpp +++ b/src/llm/language_model/continuous_batching/llm_executor.hpp @@ -59,12 +59,36 @@ struct LLMExecutor { cv.notify_one(); } + std::string formatBytes(size_t bytes) + { + const double KB = 1024.0; + const double MB = KB * 1024.0; + const double GB = MB * 1024.0; + const double TB = GB * 1024.0; + + std::ostringstream oss; + oss << std::fixed << std::setprecision(1); + + if (bytes >= TB) + oss << (bytes / TB) << " TB"; + else if (bytes >= GB) + oss << (bytes / GB) << " GB"; + else if (bytes >= MB) + oss << (bytes / MB) << " MB"; + else if (bytes >= KB) + oss << (bytes / KB) << " KB"; + else + oss << bytes << " B"; + + return oss.str(); + } + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-but-set-variable" void printMetrics() { ov::genai::PipelineMetrics metrics = pipe->get_metrics(); - SPDLOG_LOGGER_INFO(llm_executor_logger, "All requests: {}; Scheduled requests: {}; Cache usage {:.1f}%;", - metrics.requests, metrics.scheduled_requests, metrics.cache_usage); + SPDLOG_LOGGER_INFO(llm_executor_logger, "All requests: {}; Scheduled requests: {}; Cache usage {:.1f}% of {};", + metrics.requests, metrics.scheduled_requests, metrics.cache_usage, formatBytes(metrics.kv_cache_size_in_bytes)); } }; #pragma GCC diagnostic pop From dbb1d34e10148152ffcb985705a449b2a95dc61c Mon Sep 17 00:00:00 2001 From: rasapala Date: Thu, 19 Feb 2026 14:23:28 +0100 Subject: [PATCH 02/10] Add dynamic flag --- .../continuous_batching/llm_executor.hpp | 24 
+++++++++++++++---- .../servable_initializer.cpp | 2 +- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/llm/language_model/continuous_batching/llm_executor.hpp b/src/llm/language_model/continuous_batching/llm_executor.hpp index 14bc58f047..2c4bdc640a 100644 --- a/src/llm/language_model/continuous_batching/llm_executor.hpp +++ b/src/llm/language_model/continuous_batching/llm_executor.hpp @@ -31,13 +31,15 @@ namespace ovms { struct LLMExecutor { + bool isDynamicKVCache; // For logging purposes we could have more information about graph and node here std::mutex mutex; std::condition_variable cv; std::shared_ptr pipe = nullptr; - LLMExecutor(std::shared_ptr pipe) { + LLMExecutor(std::shared_ptr pipe, bool isDynamicKVCacheSet = false) { this->pipe = std::move(pipe); + this->isDynamicKVCache = isDynamicKVCacheSet; } bool hasRequests() { @@ -59,6 +61,18 @@ struct LLMExecutor { cv.notify_one(); } + std::string formatCacheInfo(float cacheUsage, size_t cacheBytes, bool isCacheDynamic) { + std::ostringstream oss; + oss << std::fixed << std::setprecision(1); + if (isCacheDynamic) { + oss << formatBytes(cacheBytes); + } else { + oss << cacheUsage << "% of " << formatBytes(cacheBytes); + } + + return oss.str(); + } + std::string formatBytes(size_t bytes) { const double KB = 1024.0; @@ -87,8 +101,8 @@ struct LLMExecutor { #pragma GCC diagnostic ignored "-Wunused-but-set-variable" void printMetrics() { ov::genai::PipelineMetrics metrics = pipe->get_metrics(); - SPDLOG_LOGGER_INFO(llm_executor_logger, "All requests: {}; Scheduled requests: {}; Cache usage {:.1f}% of {};", - metrics.requests, metrics.scheduled_requests, metrics.cache_usage, formatBytes(metrics.kv_cache_size_in_bytes)); + SPDLOG_LOGGER_INFO(llm_executor_logger, "All requests: {}; Scheduled requests: {}; Cache usage {};", + metrics.requests, metrics.scheduled_requests, formatCacheInfo(metrics.cache_usage, metrics.kv_cache_usage_in_bytes, this->isDynamicKVCache)); } }; #pragma GCC diagnostic pop @@ 
-122,8 +136,8 @@ class LLMExecutorWrapper { } public: - LLMExecutorWrapper(std::shared_ptr pipe) : - llmExecutor(std::move(pipe)) { + LLMExecutorWrapper(std::shared_ptr pipe, bool isDynamicKVCache = false) : + llmExecutor(std::move(pipe), isDynamicKVCache) { llmExecutorThread = std::thread(LLMExecutorWrapper::run, &llmExecutor, &finishExecutorThread); } diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp index 9b2018b86d..27f4f51aee 100644 --- a/src/llm/language_model/continuous_batching/servable_initializer.cpp +++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp @@ -223,7 +223,7 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptrmaxModelLength = parseMaxModelLength(parsedModelsPath); - properties->llmExecutorWrapper = std::make_shared(properties->pipeline); + properties->llmExecutorWrapper = std::make_shared(properties->pipeline, properties->schedulerConfig.cache_size == 0); return StatusCode::OK; } From 7729304b419b57ba811dc6221b44dd64af60d5f4 Mon Sep 17 00:00:00 2001 From: dkalinowski Date: Thu, 19 Feb 2026 09:06:44 +0100 Subject: [PATCH 03/10] Coverity fixes 2026.1 (#3803) --- demos/continuous_batching/agentic_ai/README.md | 2 +- src/http_rest_api_handler.cpp | 6 ++++++ src/server.cpp | 9 +++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index a0f721a705..2f38980ca8 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -91,7 +91,7 @@ curl -L -o models/Qwen/Qwen3-Coder-30B-A3B-Instruct/chat_template.jinja https:// :sync: gpt-oss-20b ```console python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser gptoss --reasoning_parser 
gptoss -curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/4/extras/chat_template_examples/chat_template_gpt_oss_multiturn.jinja +curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/4/extras/chat_template_examples/chat_template_gpt_oss.jinja ``` > **Note:**: Use `--pipeline_type LM` parameter in export command, for version 2025.4.*. It disables continuous batching. Not needed in weekly release or 2026.0+ releases. ::: diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp index 6bd9ab100f..afe163e6dc 100644 --- a/src/http_rest_api_handler.cpp +++ b/src/http_rest_api_handler.cpp @@ -693,6 +693,12 @@ struct V3StreamCallbackResourceGuard { CopyableObjectWrapper& requestWrapper; std::shared_ptr& serverReaderWriter; + V3StreamCallbackResourceGuard() = delete; + V3StreamCallbackResourceGuard(const V3StreamCallbackResourceGuard&) = delete; + V3StreamCallbackResourceGuard& operator=(const V3StreamCallbackResourceGuard&) = delete; + V3StreamCallbackResourceGuard& operator=(V3StreamCallbackResourceGuard&&) = delete; + V3StreamCallbackResourceGuard(V3StreamCallbackResourceGuard&&) = delete; + V3StreamCallbackResourceGuard( CopyableObjectWrapper& executorWrapper, CopyableObjectWrapper& requestWrapper, diff --git a/src/server.cpp b/src/server.cpp index d89d54fcaa..3e58d34ff7 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -430,12 +430,21 @@ class ModulesShutdownGuard { ~ModulesShutdownGuard() { this->server.shutdownModules(); } + ModulesShutdownGuard(const ModulesShutdownGuard&) = delete; + ModulesShutdownGuard& operator=(const ModulesShutdownGuard&) = delete; + ModulesShutdownGuard(ModulesShutdownGuard&&) = delete; + ModulesShutdownGuard& operator=(ModulesShutdownGuard&&) = delete; }; class OvmsExitGuard { Server& server; public: + OvmsExitGuard() = delete; + 
OvmsExitGuard(const OvmsExitGuard&) = delete; + OvmsExitGuard& operator=(const OvmsExitGuard&) = delete; + OvmsExitGuard& operator=(OvmsExitGuard&&) = delete; + OvmsExitGuard(OvmsExitGuard&&) = delete; OvmsExitGuard(Server& server) : server(server) { server.setExitStatus(0); } ~OvmsExitGuard() { server.setExitStatus(1); } From 7fb778ac8724cc9cc2a659b36e743e1b28a4c577 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Thu, 19 Feb 2026 09:38:22 +0100 Subject: [PATCH 04/10] Update transcriptions docs (#3961) --- demos/audio/README.md | 51 ++++++++++++++++++-- demos/common/export_models/export_model.py | 4 +- docs/model_server_rest_api_speech_to_text.md | 4 +- docs/speech_recognition/reference.md | 4 +- 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/demos/audio/README.md b/demos/audio/README.md index 8e79f5e4be..31566f1e7d 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -144,7 +144,7 @@ An asynchronous benchmarking client can be used to access the model server perfo git clone https://github.com/openvinotoolkit/model_server cd model_server/demos/benchmark/v3/ pip install -r requirements.txt -python benchmark.py --api_url http://localhost:8122/v3/audio/speech --model microsoft/speecht5_tts --batch_size 1 --limit 100 --request_rate inf --backend text2speech --dataset edinburghcstr/ami --hf-subset 'ihm' --tokenizer openai/whisper-large-v3-turbo --trust-remote-code True +python benchmark.py --api_url http://localhost:8000/v3/audio/speech --model microsoft/speecht5_tts --batch_size 1 --limit 100 --request_rate inf --backend text2speech --dataset edinburghcstr/ami --hf-subset 'ihm' --tokenizer openai/whisper-large-v3-turbo --trust-remote-code True Number of documents: 100 100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:58<00:00, 1.19s/it] Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation. 
@@ -178,10 +178,11 @@ Run `export_model.py` script to download and quantize the model: **CPU** ```console -python export_model.py speech2text --source_model openai/whisper-large-v3-turbo --weight-format fp16 --model_name openai/whisper-large-v3-turbo --config_file_path models/config.json --model_repository_path models --overwrite_models +python export_model.py speech2text --source_model openai/whisper-large-v3-turbo --weight-format fp16 --model_name openai/whisper-large-v3-turbo --config_file_path models/config.json --model_repository_path models --overwrite_models --enable_word_timestamps ``` > **Note:** Change the `--weight-format` to quantize the model to `int8` precision to reduce memory consumption and improve performance. +> **Note:** `--enable_word_timestamps` can be omitted if there is no need for word timestamps support. ### Deployment @@ -230,7 +231,7 @@ Transcript file that was previously generated with audio/speech endpoint. ```bash -curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@speech.wav" -F model="openai/whisper-large-v3-turbo" +curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@speech.wav" -F model="openai/whisper-large-v3-turbo" -F language="en" ``` ```json {"text": " The quick brown fox jumped over the lazy dog."} @@ -253,6 +254,7 @@ client = OpenAI(base_url=url, api_key="not_used") audio_file = open(filename, "rb") transcript = client.audio.transcriptions.create( model="openai/whisper-large-v3-turbo", + language="en", file=audio_file ) @@ -262,6 +264,49 @@ print(transcript.text) The quick brown fox jumped over the lazy dog. 
``` ::: +:::{dropdown} **Unary call with timestamps** + + +```bash +curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@speech.wav" -F model="openai/whisper-large-v3-turbo" -F language="en" -F timestamp_granularities[]="segment" -F timestamp_granularities[]="word" +``` +```json +{"text":" A quick brown fox jumped over the lazy dog","words":[{"word":" A","start":0.0,"end":0.14000000059604645},{"word":" quick","start":0.14000000059604645,"end":0.3400000035762787},{"word":" brown","start":0.3400000035762787,"end":0.7799999713897705},{"word":" fox","start":0.7799999713897705,"end":1.3199999332427979},{"word":" jumped","start":1.3199999332427979,"end":1.7799999713897705},{"word":" over","start":1.7799999713897705,"end":2.0799999237060547},{"word":" the","start":2.0799999237060547,"end":2.259999990463257},{"word":" lazy","start":2.259999990463257,"end":2.5399999618530273},{"word":" dog","start":2.5399999618530273,"end":2.919999837875366}],"segments":[{"text":" A quick brown fox jumped over the lazy dog","start":0.0,"end":3.1399998664855957}]} +``` +::: + +:::{dropdown} **Unary call with python OpenAI library with timestamps** + +```python +from pathlib import Path +from openai import OpenAI + +filename = "speech.wav" +url="http://localhost:8000/v3" + + +speech_file_path = Path(__file__).parent / filename +client = OpenAI(base_url=url, api_key="not_used") + +audio_file = open(filename, "rb") +transcript = client.audio.transcriptions.create( + model="openai/whisper-large-v3-turbo", + language="en", + response_format="verbose_json", + timestamp_granularities=["segment", "word"], + file=audio_file +) + +print(transcript.text) +print(transcript.segments) +print(transcript.words) +``` +``` + A quick brown fox jumped over the lazy dog +[TranscriptionSegment(id=None, avg_logprob=None, compression_ratio=None, end=3.1399998664855957, no_speech_prob=None, seek=None, start=0.0, temperature=None, text=' A quick brown fox jumped over the 
lazy dog', tokens=None)] +[TranscriptionWord(end=0.14000000059604645, start=0.0, word=' A'), TranscriptionWord(end=0.3400000035762787, start=0.14000000059604645, word=' quick'), TranscriptionWord(end=0.7799999713897705, start=0.3400000035762787, word=' brown'), TranscriptionWord(end=1.3199999332427979, start=0.7799999713897705, word=' fox'), TranscriptionWord(end=1.7799999713897705, start=1.3199999332427979, word=' jumped'), TranscriptionWord(end=2.0799999237060547, start=1.7799999713897705, word=' over'), TranscriptionWord(end=2.259999990463257, start=2.0799999237060547, word=' the'), TranscriptionWord(end=2.5399999618530273, start=2.259999990463257, word=' lazy'), TranscriptionWord(end=2.919999837875366, start=2.5399999618530273, word=' dog')] +``` +::: ## Benchmarking transcription An asynchronous benchmarking client can be used to access the model server performance with various load conditions. Below are execution examples captured on Intel(R) Core(TM) Ultra 7 258V. diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 97a9149b8b..d84c564c22 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -95,6 +95,7 @@ def add_common_arguments(parser): parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint') add_common_arguments(parser_speech2text) parser_speech2text.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') +parser_speech2text.add_argument('--enable_word_timestamps', default=False, action='store_true', help='Load model with word timestamps support.', dest='enable_word_timestamps') args = vars(parser.parse_args()) t2s_graph_template = """ @@ -129,7 +130,8 @@ def add_common_arguments(parser): [type.googleapis.com / mediapipe.S2tCalculatorOptions]: { models_path: "{{model_path}}", plugin_config: '{ 
"NUM_STREAMS": "{{num_streams|default(1, true)}}" }', - target_device: "{{target_device|default("CPU", true)}}" + target_device: "{{target_device|default("CPU", true)}}", + enable_word_timestamps: {% if not enable_word_timestamps %}false{% else %}true{% endif%}, } } } diff --git a/docs/model_server_rest_api_speech_to_text.md b/docs/model_server_rest_api_speech_to_text.md index f3bf916b9b..b82d482481 100644 --- a/docs/model_server_rest_api_speech_to_text.md +++ b/docs/model_server_rest_api_speech_to_text.md @@ -47,8 +47,8 @@ curl -X POST http://localhost:8000/v3/audio/translations \ | prompt | ❌ | ✅ | string | An optional text to guide the model's style or continue a previous audio segment. | | response_format | ❌ | ✅ | string | The format of the output. | | stream | ❌ | ✅ | boolean | Generate the response in streaming mode. | -| temperature | ❌ | ✅ | number | The sampling temperature, between 0 and 1. | -| timestamp_granularities | ❌ | ✅ | array | The timestamp granularities to populate for this transcription. | +| temperature | ⚠️ | ✅ | number | The sampling temperature, between 0 and 1. | +| timestamp_granularities | ⚠️ | ✅ | array | The timestamp granularities to populate for this transcription. 
Supported values: "word" and "segment" (⚠️**Note**: To enable word timestamps `enable_word_timestamps: true` need to be set in graph.pbtxt) | ### Translation diff --git a/docs/speech_recognition/reference.md b/docs/speech_recognition/reference.md index 3ad4e0c16d..2bfe69476f 100644 --- a/docs/speech_recognition/reference.md +++ b/docs/speech_recognition/reference.md @@ -42,7 +42,8 @@ node { node_options: { [type.googleapis.com / mediapipe.S2tCalculatorOptions]: { models_path: "./", - target_device: "CPU" + target_device: "CPU", + enable_word_timestamps: true } } } @@ -53,6 +54,7 @@ Above node configuration should be used as a template since user is not expected The calculator supports the following `node_options` for tuning the pipeline configuration: - `required string models_path` - location of the models and scheduler directory (can be relative); - `optional string device` - device to load models to. Supported values: "CPU, GPU, NPU" [default = "CPU"] +- `optional bool enable_word_timestamps` - set to true if model should support user requests for word timestamp. [default = false] We recommend using [export script](../../demos/common/export_models/README.md) to prepare models directory structure for serving. Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#speech-recognition-models). From 13d63eb0f51c9f1b0dee364b7de8211874767a09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Thu, 19 Feb 2026 11:22:11 +0100 Subject: [PATCH 05/10] flag for export model to enable export InternVL2 (#3970) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 🛠 Summary [CVS-180948](https://jira.devtools.intel.com/browse/CVS-180948) Adding new parameter to export_model to enable export of model with requirement for trusting remote code. ### 🧪 Checklist - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. 
`` --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Miłosz Żeglarski --- demos/common/export_models/README.md | 1 + demos/common/export_models/export_model.py | 2 +- demos/common/export_models/requirements.txt | 2 +- demos/continuous_batching/vlm/README.md | 2 -- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 532f032d2f..fe2abe3684 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -1,6 +1,7 @@ # Exporting models using script {#ovms_demos_common_export} This documents describes how to export, optimize and configure models prior to server deployment with provided python script. This approach is more flexible than using [pull feature](../../../docs/pull_hf_models.md) from OVMS as it allows for using models that were not optimized beforehand and provided in [OpenVINO organization](https://huggingface.co/OpenVINO) in HuggingFace, but requires having Python set up to work. +> **Warning:** This script uses option `--trust-remote-code`, which allows model-provided Python code to run on your machine during export. Use only trusted models/sources, review repository code before running, and avoid executing this script in sensitive environments. ## What it does diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index d84c564c22..1a93b048eb 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -414,7 +414,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name raise ValueError("Failed to export llm model", source_model) if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))): print("Tokenizer and detokenizer not found in the exported model. 
Exporting tokenizer and detokenizer from HF model") - convert_tokenizer_command = "convert_tokenizer --with-detokenizer -o {} {}".format(llm_model_path, source_model) + convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}" if os.system(convert_tokenizer_command): raise ValueError("Failed to export tokenizer and detokenizer", source_model) ### Export draft model for speculative decoding diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt index a803ff361c..82bd0fb818 100644 --- a/demos/common/export_models/requirements.txt +++ b/demos/common/export_models/requirements.txt @@ -8,7 +8,7 @@ datasets diffusers # for image generation einops nncf@git+https://github.com/openvinotoolkit/nncf.git -numpy +numpy==2.2.6 # temporary change until ov tokenizers upgrade openvino-tokenizers==2026.0.0rc2 openvino==2026.0.0rc2 pillow diff --git a/demos/continuous_batching/vlm/README.md b/demos/continuous_batching/vlm/README.md index 2a391f20dc..642cc85697 100644 --- a/demos/continuous_batching/vlm/README.md +++ b/demos/continuous_batching/vlm/README.md @@ -91,8 +91,6 @@ python export_model.py text_generation --source_model OpenGVLab/InternVL2-2B --w python export_model.py text_generation --source_model OpenGVLab/InternVL2-2B --weight-format int4 --pipeline_type VLM --model_name OpenGVLab/InternVL2-2B --config_file_path models/config.json --model_repository_path models --overwrite_models --target_device GPU ``` -> **Note:** Change the `--weight-format` to quantize the model to `fp16` or `int8` precision to reduce memory consumption and improve performance. - > **Note:** You can change the model used in the demo out of any topology [tested](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) with OpenVINO. Be aware that QwenVL models executed on GPU might experience execution errors with very high resolution images. 
In case of such behavior, it is recommended to reduce the parameter `max_pixels` in `preprocessor_config.json`. From 0e2379258fc1d788d0512b63e8025c94c7bbce37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Thu, 19 Feb 2026 15:04:15 +0100 Subject: [PATCH 06/10] Adding env var for Qwen3-coder - continue demo (#3991) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 🛠 Summary Adding environmental variable for Qwen3-Coder-30B to make it work correctly ### 🧪 Checklist - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. `` --- demos/code_local_assistant/README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md index 11746ae249..cbcd0fc6b2 100644 --- a/demos/code_local_assistant/README.md +++ b/demos/code_local_assistant/README.md @@ -215,6 +215,7 @@ Run OpenVINO Model Server with all downloaded models loaded at the same time: Please refer to OpenVINO Model Server installation first: [link](../../docs/deploying_server_baremetal.md) ```bat +set MOE_USE_MICRO_GEMM_PREFILL=0 ovms --rest_port 8000 --config_path ./models/config_all.json ``` ::: @@ -222,7 +223,7 @@ ovms --rest_port 8000 --config_path ./models/config_all.json :sync: Linux CPU ### Linux: via Docker with CPU ```bash -docker run -d --rm -u $(id -u):$(id -g) \ +docker run -d --rm -u $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 \ -p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json ``` ::: @@ -230,12 +231,14 @@ docker run -d --rm -u $(id -u):$(id -g) \ :sync: Linux GPU ### Linux: via Docker with GPU ```bash -docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ +docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u 
$(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 \ -p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json ``` ::: :::: +> **Note:** `MOE_USE_MICRO_GEMM_PREFILL=0` is a workaround for *Qwen3-Coder-30B-A3B-Instruct* and it will be fixed in release 2026.1 or next weekly. + ## Set Up Visual Studio Code ### Download [Continue plugin](https://www.continue.dev/) From 9e19f5be35fea586c187dd62645425d12b66e6e4 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Fri, 20 Feb 2026 09:53:43 +0100 Subject: [PATCH 07/10] Speaker embeddings demo improvements (#3987) --- demos/audio/README.md | 95 ++++++++++++---------- demos/audio/create_speaker_embedding.py | 36 ++++++++ demos/audio/requirements.txt | 5 ++ demos/common/export_models/export_model.py | 12 ++- 4 files changed, 105 insertions(+), 43 deletions(-) create mode 100644 demos/audio/create_speaker_embedding.py create mode 100644 demos/audio/requirements.txt diff --git a/demos/audio/README.md b/demos/audio/README.md index 31566f1e7d..f2eb06f84a 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -16,6 +16,17 @@ Check supported [Speech Recognition Models](https://openvinotoolkit.github.io/op **Client**: curl or Python for using OpenAI client package ## Speech generation +### Prepare speaker embeddings +When generating speech you can use default speaker voice or you can prepare your own speaker embedding file. 
Here you can see how to do it with downloaded file from online repository, but you can try with your own speech recording as well: +```bash +pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/audio/requirements.txt +mkdir -p audio_samples +curl --output audio_samples/audio.wav "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0032_8k.wav" +mkdir -p models +mkdir -p models/speakers +python create_speaker_embedding.py audio_samples/audio.wav models/speakers/voice1.bin +``` + ### Model preparation Supported models should use the topology of [microsoft/speecht5_tts](https://huggingface.co/microsoft/speecht5_tts) which needs to be converted to IR format before using in OVMS. @@ -40,48 +51,14 @@ Run `export_model.py` script to download and quantize the model: **CPU** ```console -python export_model.py text2speech --source_model microsoft/speecht5_tts --weight-format fp16 --model_name microsoft/speecht5_tts --config_file_path models/config.json --model_repository_path models --overwrite_models --vocoder microsoft/speecht5_hifigan +python export_model.py text2speech --source_model microsoft/speecht5_tts --weight-format fp16 --model_name microsoft/speecht5_tts --config_file_path models/config.json --model_repository_path models --overwrite_models --vocoder microsoft/speecht5_hifigan --speaker_name voice1 --speaker_path /models/speakers/voice1.bin ``` > **Note:** Change the `--weight-format` to quantize the model to `int8` precision to reduce memory consumption and improve performance. +> **Note:** `speaker_name` and `speaker_path` may be omitted if the default model voice is sufficient The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. 
Run the script with `--help` argument to check available parameters and see the [T2s calculator documentation](../../docs/speech_generation/reference.md) to learn more about configuration options and limitations. -### Speaker embeddings - -Instead of generating speech with default model voice you can create speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py) -```bash -curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py" -python create_speaker_embedding.py -mv speaker_embedding.bin models/ -``` -Script records your speech for 5 seconds(you can adjust duration of recording to achieve better results) and then, using speechbrain/spkrec-xvect-voxceleb model, creates `speaker_embedding.bin` file that contains your speaker embedding. -Now you need to add speaker embedding path to graph.pbtxt file of text2speech graph: -``` -input_stream: "HTTP_REQUEST_PAYLOAD:input" -output_stream: "HTTP_RESPONSE_PAYLOAD:output" -node { - name: "T2sExecutor" - input_side_packet: "TTS_NODE_RESOURCES:t2s_servable" - calculator: "T2sCalculator" - input_stream: "HTTP_REQUEST_PAYLOAD:input" - output_stream: "HTTP_RESPONSE_PAYLOAD:output" - node_options: { - [type.googleapis.com / mediapipe.T2sCalculatorOptions]: { - models_path: "./", - plugin_config: '{ "NUM_STREAMS": "1" }', - target_device: "CPU", - voices: [ - { - name: "voice", - path: "/models/speaker_embedding.bin", - } - ] - } - } -} -``` - ### Deployment **CPU** @@ -101,7 +78,7 @@ ovms --rest_port 8000 --source_model microsoft/speecht5_tts --model_repository_p ### Request Generation -:::{dropdown} **Unary call with curl** +:::{dropdown} **Unary call with curl with default voice** ```bash @@ -109,7 +86,41 @@ curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" - ``` ::: 
-:::{dropdown} **Unary call with OpenAi python library** +:::{dropdown} **Unary call with OpenAI python library with default voice** + +```python +from pathlib import Path +from openai import OpenAI + +prompt = "The quick brown fox jumped over the lazy dog" +filename = "speech.wav" +url="http://localhost:8000/v3" + + +speech_file_path = Path(__file__).parent / "speech.wav" +client = OpenAI(base_url=url, api_key="not_used") + +with client.audio.speech.with_streaming_response.create( + model="microsoft/speecht5_tts", + voice=None, + input=prompt +) as response: + response.stream_to_file(speech_file_path) + + +print("Generation finished") +``` +::: + +:::{dropdown} **Unary call with curl** + + +```bash +curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"microsoft/speecht5_tts\", \"voice\":\"voice1\", \"input\": \"The quick brown fox jumped over the lazy dog\"}" -o speech.wav +``` +::: + +:::{dropdown} **Unary call with OpenAI python library** ```python from pathlib import Path @@ -125,7 +136,7 @@ client = OpenAI(base_url=url, api_key="not_used") with client.audio.speech.with_streaming_response.create( model="microsoft/speecht5_tts", - voice="unused", + voice="voice1", input=prompt ) as response: response.stream_to_file(speech_file_path) @@ -222,12 +233,12 @@ ovms --rest_port 8000 --source_model openai/whisper-large-v3-turbo --model_repos ``` ::: -The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [S2t calculator documentation](../../docs/speech_recognition/reference.md) to learn more about configuration options and limitations. +The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. 
Run the script with `--help` argument to check available parameters and see the [s2t calculator documentation](../../docs/speech_recognition/reference.md) to learn more about configuration options and limitations. ### Request Generation Transcript file that was previously generated with audio/speech endpoint. -:::{dropdown} **Unary call with curl** +:::{dropdown} **Unary call with cURL** ```bash @@ -381,7 +392,7 @@ ovms --rest_port 8000 --source_model OpenVINO/whisper-large-v3-fp16-ov --model_r ### Request Generation Transcript and translate file that was previously generated with audio/speech endpoint. -:::{dropdown} **Unary call with curl** +:::{dropdown} **Unary call with cURL** ```bash diff --git a/demos/audio/create_speaker_embedding.py b/demos/audio/create_speaker_embedding.py new file mode 100644 index 0000000000..1d0a5663b4 --- /dev/null +++ b/demos/audio/create_speaker_embedding.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torchaudio +from speechbrain.inference.speaker import EncoderClassifier +import sys + +if len(sys.argv) != 3: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + +file = sys.argv[1] +signal, fs = torchaudio.load(file) +if signal.shape[0] > 1: + signal = torch.mean(signal, dim=0, keepdim=True) +expected_sample_rate = 16000 +if fs != expected_sample_rate: + resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=expected_sample_rate) + signal = resampler(signal) + +if signal.ndim != 2 or signal.shape[0] != 1: + print(f"Error: expected signal shape [1, num_samples], got {list(signal.shape)}") + sys.exit(1) +if signal.shape[1] == 0: + print("Error: audio file contains no samples") + sys.exit(1) + +classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb") +embedding = classifier.encode_batch(signal) +embedding = torch.nn.functional.normalize(embedding, dim=2) +embedding = 
embedding.squeeze().cpu().numpy().astype("float32") + +output_file = sys.argv[2] +embedding.tofile(output_file) \ No newline at end of file diff --git a/demos/audio/requirements.txt b/demos/audio/requirements.txt new file mode 100644 index 0000000000..b57ab92bd2 --- /dev/null +++ b/demos/audio/requirements.txt @@ -0,0 +1,5 @@ +--extra-index-url "https://download.pytorch.org/whl/cpu" +torch==2.9.1+cpu +torchaudio==2.9.1+cpu +speechbrain==1.0.3 +openai==2.21.0 \ No newline at end of file diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 1a93b048eb..b7b6a55b72 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -91,6 +91,9 @@ def add_common_arguments(parser): add_common_arguments(parser_text2speech) parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. 
For example microsoft/speecht5_hifigan', dest='vocoder') +parser_text2speech.add_argument('--speaker_name', type=str, help='Name of the speaker', dest='speaker_name') +parser_text2speech.add_argument('--speaker_path', type=str, help='Path to the speaker.bin file.', dest='speaker_path') + parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint') add_common_arguments(parser_speech2text) @@ -111,7 +114,14 @@ def add_common_arguments(parser): [type.googleapis.com / mediapipe.T2sCalculatorOptions]: { models_path: "{{model_path}}", plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }', - target_device: "{{target_device|default("CPU", true)}}" + target_device: "{{target_device|default("CPU", true)}}", + {%- if speaker_name and speaker_path %} + voices: [ + { + name: "{{speaker_name}}", + path: "{{speaker_path}}" + } + ]{% endif %} } } } From a446f1969115bf7774fb166e3c8b2404f643457a Mon Sep 17 00:00:00 2001 From: Adrian Tobiszewski Date: Fri, 20 Feb 2026 14:18:24 +0100 Subject: [PATCH 08/10] Add copilot instructions (#3992) --- .github/copilot-instructions.md | 191 ++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000..05c4dd81b7 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,191 @@ +# Copilot Instructions for OpenVINO Model Server (OVMS) + +## Project Overview + +OpenVINO Model Server (OVMS) is a high-performance inference serving platform built on top of **OpenVINO** and **OpenVINO GenAI**. The codebase is primarily **C++** with **Bazel** as the build system. Supporting infrastructure uses **Makefiles**, **Dockerfiles** (Ubuntu & Red Hat), and **batch files** (Windows). + +**Performance is a top priority** — both **throughput** and **latency** are critical. 
Code changes should be evaluated for their performance impact. Avoid unnecessary copies, allocations, and blocking operations on the hot path. + +## Repository Structure + +- `src/` — Main C++ source code (server, gRPC/REST handlers, model management, pipelines, mediapipe, LLM, C API) +- `src/test/` — C++ unit tests (gtest-based); this is where most developer tests live +- `src/python/` — Python custom node bindings and related code +- `demos/` — End-user demo applications +- `client/` — Client libraries (C++, Python, Go, Java) +- `docs/` — Documentation +- `third_party/` — Third-party dependency definitions for Bazel +- `Dockerfile.ubuntu` / `Dockerfile.redhat` — Multi-stage Dockerfiles for Linux builds +- `Makefile` — Orchestrates Docker-based builds and test runs +- `*.bat` files — Windows build and setup scripts + +## Code Style + +- C++ style is enforced via `cpplint` and `clang-format` +- Run `make style` to check formatting +- Apache 2.0 license headers are required on all source files + +## Expertise Areas + +1. **OpenVINO Expertise:** + - Proficient with OpenVINO core libraries and `ov::genai` components + - Familiar with OpenVINO performance optimization techniques +2. **C++ Proficiency:** + - Strong C++17 skills + - Familiar with best practices in memory management, concurrency, and template programming +3. **Serving Infrastructure:** + - gRPC and REST API handler design + - Model management, pipeline orchestration, and MediaPipe integration + - C API (`libovms_shared.so`) surface and bindings +4. **Build System Awareness:** + - Bazel build configuration and dependency management + - Minimizing C++ build times: forward declarations, include-what-you-use, avoiding transitive header leakage + - Understanding of Bazel targets, build flags (`--//:distro`), and multi-stage Docker builds + +## Code Review Instructions for PRs + +When analyzing a Pull Request, follow this protocol: + +1. Follow **C++ Core Guidelines** strictly. 
Include references in review comments. +2. Check for **hidden performance costs**: avoid `dynamic_cast` on the hot path; suggest `static_cast` or redesign if the type is known. +3. **Avoid copies**: ensure large data structures (tensors, buffers) are passed by reference or moved, not copied. +4. **Documentation**: ensure new public APIs have docstrings in C++ headers and Python bindings; update `docs/` as needed. +5. **Test coverage**: ensure that new features or changes have corresponding tests in `src/test/`. +6. **Formatting & safety:** + - No `using namespace std; using namespace ov;`. Prefer explicit using with specific symbols if needed, for readability. + - No `auto` for primitive types where it obscures readability. + - Use `const` and `constexpr` wherever possible. +7. Pass non-fundamental values by `const` reference wherever possible. +8. Prefer member initializer lists over direct assignments in constructor bodies. +9. Verify that the result of every newly introduced function is used in at least one call site (except `void` functions). +10. Use descriptive function and variable names. Avoid duplicate code — extract common functionality into reusable utilities. +11. When initial container values are known upfront, prefer initializer-list / brace-initialization over constructing an empty container and inserting. +12. Unused functions and includes are not allowed. Build times are already long — do not add unnecessary `#include` directives. Prefer forward declarations where possible and follow the include-what-you-use principle. + - **Forward-declare in headers, include in `.cpp`**: if a header only uses pointers or references to a type, use a forward declaration (`class Foo;`) instead of `#include "foo.hpp"`. Move the full `#include` to the `.cpp` file where the type is actually used. + - **Keep headers self-contained but minimal**: each header must compile on its own, but should not pull in transitive dependencies that callers don't need. 
+ - **Prefer opaque types / Pimpl**: for complex implementation details, consider the Pimpl idiom to keep implementation-only types out of the public header entirely. + - **Never include a header solely for a typedef or enum**: forward-declare the enum (`enum class Foo;` in C++17) or relocate the typedef to a lightweight `fwd.hpp`-style header. +13. Be mindful when accepting `const T&` in constructors or functions that store the reference: verify that the referenced object's lifetime outlives the usage to avoid dangling references. + +## Build System + +### Bazel (primary build tool) + +Building and testing is done **inside a Docker `-build` container** with the repository mounted. Developers do not run Bazel on the host directly. + +**Important:** Building the `-build` image from scratch is time-expensive, but may be required if dependencies or the build environment change. Before building a new one, check if one already exists: +```bash +docker images | grep -- -build +``` +If a `-build` image exists, start a container from it with the repository mounted: +```bash +docker run -it -v $(pwd):/ovms \ + -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \ + <build_image_name> bash +``` +If a container from a previous session is still available (`docker ps -a`), reuse it with `docker start -i <container_id>` or `docker exec -it <container_id> bash`. 
+ +**Key Bazel targets:** + +| Target | Description | +|--------|-------------| +| `//src:ovms` | Main OVMS server binary | +| `//src:ovms_test` | C++ unit tests (gtest) | +| `//src:ovms_shared` | C API shared library (`libovms_shared.so`) | + +**Build the server:** +```bash +bazel build //src:ovms +``` + +**Build and run unit tests:** +```bash +bazel build //src:ovms_test +bazel test --test_summary=detailed --test_output=streamed //src:ovms_test +``` + +### Red Hat builds — distro flag + +For Red Hat (UBI9) builds, the `--//:distro=redhat` flag must be passed to all Bazel commands: +```bash +bazel build --//:distro=redhat //src:ovms +bazel test --//:distro=redhat //src:ovms_test +``` + +The default distro is `ubuntu`. Inside the `-build` container, the distro is auto-detected from `/etc/redhat-release`. + +### Makefile (Docker-based workflow) + +The Makefile orchestrates full Docker-based builds. Key targets: + +| Target | Description | +|--------|-------------| +| `make docker_build` | Full build: builder image → package → release images (default target) | +| `make ovms_builder_image` | Build the `-build` Docker image (compilation container) | +| `make targz_package` | Extract `.tar.gz` release package | +| `make ovms_release_images` | Build CPU and GPU release Docker images | +| `make run_unit_tests` | Run C++ unit tests in the `-build` container | +| `make test_functional` | Run Python functional tests | +| `make style` / `make cpplint` | Code style checks | + +**Red Hat build via Make:** +```bash +make docker_build BASE_OS=redhat +``` + +**Supported `BASE_OS` values:** `ubuntu24`, `ubuntu22`, `redhat` + +### Dockerfile stages + +Both `Dockerfile.ubuntu` and `Dockerfile.redhat` use multi-stage builds: + +| Stage | Purpose | +|-------|---------| +| `base_build` | System dependencies, Boost, Azure SDK, OpenCV | +| `build` | Bazel + OpenVINO setup, compiles OVMS (the `-build` container) | +| `capi-build` | Builds C API shared library and examples | +| `pkg` | 
Packages everything into a `.tar.gz` | +| `release` | Minimal runtime image with entrypoint | + +### Windows builds + +Windows builds use batch files in the repository root: +- `windows_install_build_dependencies.bat` — Install build dependencies (MSVC 2022 Build Tools, etc.) +- `windows_build.bat` — Main build script +- `windows_test.bat` — Run tests + +Windows-specific Bazel config: `--config=win_mp_on_py_off` (or `--config=win_mp_on_py_on` for Python support). + +## Testing + +### Test setup + +Before running tests, test models must be prepared: +```bash +make prepare_models +``` + +Models are exported using the `demos/common/export_models/export_model.py` script (used internally by the test setup). + +If LLM test models change (e.g., new model version, OpenVINO version change or config update), you may need to remove and regenerate the LLM test data: +```bash +rm -rf src/test/llm_testing +make prepare_models +``` + +### Running tests + +Running the full `//src:ovms_test` suite is **time-consuming**. During development, always run only the test fixtures relevant to your changes first using `--test_filter`: +```bash +bazel test --test_summary=detailed --test_output=streamed --test_filter="SuiteName.TestName" //src:ovms_test +``` +Run the full test suite only after the targeted tests pass. 
+ +### Test structure + +- **Unit tests** are in `src/test/` — gtest-based C++ tests covering all server components +- Test files follow the `*_test.cpp` naming convention +- Test utilities: `test_utils.hpp`, `light_test_utils.hpp`, `c_api_test_utils.hpp` +- Test models are stored in `src/test/` subdirectories (e.g., `dummy/`, `passthrough/`, `summator/`) +- Specialized test areas: `src/test/llm/`, `src/test/mediapipe/`, `src/test/python/`, `src/test/embeddings/` + From 14b1c8f23feebe68123a1a5be58b08e6ba63eb96 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Mon, 23 Feb 2026 10:59:46 +0100 Subject: [PATCH 09/10] Npu docs (#3962) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 🛠 Summary Updating export and NPU usage. ### 🧪 Checklist - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. `` --------- Co-authored-by: Damian Kalinowski Co-authored-by: Trawinski, Dariusz --- demos/embeddings/README.md | 53 ++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index b74042f754..fbbfd91ea5 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -243,7 +243,19 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp ::: :::: +**NPU** +::::{tab-set} +:::{tab-item} Qwen/Qwen3-Embedding-0.6B +:sync: Qwen3-Embedding-0.6B-fp16 +```console +python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models +``` +::: +:::: +> **Note** For NPU Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision. 
For int4 precisions, add required extra parameter `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` +> **Note** For NPU the pooling mode --pooling LAST has the best accuracy. +> **Note** For NPU and the weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. @@ -280,22 +292,22 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- ## Tested models All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. The demo is validated against following Hugging Face models: -|Model name|Pooling| -|---|---| -|OpenVINO/Qwen3-Embedding-0.6B-int8-ov|LAST| -|OpenVINO/bge-base-en-v1.5-int8-ov|CLS| -|BAAI/bge-large-en-v1.5|CLS| -|BAAI/bge-large-zh-v1.5|CLS| -|thenlper/gte-small|CLS| -|sentence-transformers/all-MiniLM-L12-v2|MEAN| -|sentence-transformers/all-distilroberta-v1|MEAN| -|mixedbread-ai/deepset-mxbai-embed-de-large-v1|MEAN| -|intfloat/multilingual-e5-large-instruct|MEAN| -|intfloat/multilingual-e5-large|MEAN| -|Alibaba-NLP/gte-large-en-v1.5|CLS| -|nomic-ai/nomic-embed-text-v1.5|MEAN| -|sentence-transformers/all-mpnet-base-v2|MEAN| - +|Model name|Pooling|Devices| +|---|---|---| +|OpenVINO/Qwen3-Embedding-0.6B-int8-ov|LAST|CPU,GPU| +|OpenVINO/bge-base-en-v1.5-int8-ov|CLS|CPU,GPU| +|Qwen/Qwen3-Embedding-0.6B|LAST|CPU,GPU,NPU| +|BAAI/bge-large-en-v1.5|CLS|CPU,GPU,NPU| +|BAAI/bge-large-zh-v1.5|CLS|CPU,GPU,NPU| +|thenlper/gte-small|CLS|CPU,GPU,NPU| +|sentence-transformers/all-MiniLM-L12-v2|MEAN|CPU,GPU| +|sentence-transformers/all-distilroberta-v1|MEAN|CPU,GPU| +|mixedbread-ai/deepset-mxbai-embed-de-large-v1|MEAN|CPU,GPU| 
+|intfloat/multilingual-e5-large-instruct|MEAN|CPU,GPU| +|intfloat/multilingual-e5-large|MEAN|CPU,GPU| +|Alibaba-NLP/gte-large-en-v1.5|CLS|CPU,GPU| +|nomic-ai/nomic-embed-text-v1.5|MEAN|CPU,GPU| +|sentence-transformers/all-mpnet-base-v2|MEAN|CPU,GPU,NPU| ## Server Deployment @@ -313,6 +325,14 @@ to `docker run` command, use the image with GPU support and make sure set the ta ```bash docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json ``` +**NPU** +NOTE: NPU execution for embeddings model is a preview feature. +In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` +to `docker run` command, use the image with NPU support and make sure set the target_device in subconfig.json to NPU. Also make sure the export model quantization level and cache size fit to the NPU memory. 
All of that can be applied with the commands: + +```bash +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json +``` ::: :::{dropdown} **Deploying on Bare Metal** @@ -368,6 +388,7 @@ curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "usage":{"prompt_tokens":4,"total_tokens":4} } + ``` ::: From a13d0ba81c1f3d0886b09f0eec98c2ea9ffc6b58 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Mon, 23 Feb 2026 16:16:13 +0100 Subject: [PATCH 10/10] Style --- src/llm/language_model/continuous_batching/llm_executor.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llm/language_model/continuous_batching/llm_executor.hpp b/src/llm/language_model/continuous_batching/llm_executor.hpp index 2c4bdc640a..28203075f6 100644 --- a/src/llm/language_model/continuous_batching/llm_executor.hpp +++ b/src/llm/language_model/continuous_batching/llm_executor.hpp @@ -73,8 +73,7 @@ struct LLMExecutor { return oss.str(); } - std::string formatBytes(size_t bytes) - { + std::string formatBytes(size_t bytes) { const double KB = 1024.0; const double MB = KB * 1024.0; const double GB = MB * 1024.0;