From 13ae37bc1fa92560aea19a7236d959be56ef2eca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Thu, 19 Feb 2026 11:22:11 +0100 Subject: [PATCH 1/8] flag for export model to enable export InternVL2 (#3970) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [CVS-180948](https://jira.devtools.intel.com/browse/CVS-180948) Adding new parameter to export_model to enable export of model with requirement for trusting remote code. - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. `` --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Miłosz Żeglarski --- demos/common/export_models/README.md | 1 + demos/common/export_models/export_model.py | 2 +- demos/common/export_models/requirements.txt | 2 +- demos/continuous_batching/vlm/README.md | 2 -- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 96d4f712f3..f9aa89e7cf 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -1,6 +1,7 @@ # Exporting models using script {#ovms_demos_common_export} This documents describes how to export, optimize and configure models prior to server deployment with provided python script. This approach is more flexible than using [pull feature](../../../docs/pull_hf_models.md) from OVMS as it allows for using models that were not optimized beforehand and provided in [OpenVINO organization](https://huggingface.co/OpenVINO) in HuggingFace, but requires having Python set up to work. +> **Warning:** This script uses option `--trust-remote-code`, which allows model-provided Python code to run on your machine during export. Use only trusted models/sources, review repository code before running, and avoid executing this script in sensitive environments. 
## What it does diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 97a9149b8b..973490e33b 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -412,7 +412,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name raise ValueError("Failed to export llm model", source_model) if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))): print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model") - convert_tokenizer_command = "convert_tokenizer --with-detokenizer -o {} {}".format(llm_model_path, source_model) + convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}" if os.system(convert_tokenizer_command): raise ValueError("Failed to export tokenizer and detokenizer", source_model) ### Export draft model for speculative decoding diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt index 8845d56ec3..af6d59e63e 100644 --- a/demos/common/export_models/requirements.txt +++ b/demos/common/export_models/requirements.txt @@ -8,7 +8,7 @@ datasets diffusers # for image generation einops nncf@git+https://github.com/openvinotoolkit/nncf.git@7a6ac29a15d247c0fc76da93f4d53bbfa7b489cc -numpy +numpy==2.2.6 # temporary change until ov tokenizers upgrade openvino-tokenizers==2026.0.0rc3 openvino==2026.0.0rc3 pillow diff --git a/demos/continuous_batching/vlm/README.md b/demos/continuous_batching/vlm/README.md index ff4145a3a0..3e356e95c6 100644 --- a/demos/continuous_batching/vlm/README.md +++ b/demos/continuous_batching/vlm/README.md @@ -91,8 +91,6 @@ python export_model.py text_generation --source_model OpenGVLab/InternVL2-2B --w python export_model.py text_generation --source_model OpenGVLab/InternVL2-2B --weight-format int4 --pipeline_type VLM 
--model_name OpenGVLab/InternVL2-2B --config_file_path models/config.json --model_repository_path models --overwrite_models --target_device GPU ``` -> **Note:** Change the `--weight-format` to quantize the model to `fp16` or `int8` precision to reduce memory consumption and improve performance. - > **Note:** You can change the model used in the demo out of any topology [tested](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) with OpenVINO. Be aware that QwenVL models executed on GPU might experience execution errors with very high resolution images. In case of such behavior, it is recommended to reduce the parameter `max_pixels` in `preprocessor_config.json`. From af0eb40a5f268e60815574a19664770dfe0cb418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Wed, 18 Feb 2026 12:51:23 +0100 Subject: [PATCH 2/8] Cherry-pick: ONNX model demo improvement (#3975) [CVS-180837](https://jira.devtools.intel.com/browse/CVS-180837) Improving Using ONNX model demo by simplifying it showing improvements related to preprocessing feature. - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. `` --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Trawinski, Dariusz --- demos/using_onnx_model/python/README.md | 75 +++++++++---------- .../python/onnx_model_demo.py | 52 ++++--------- .../using_onnx_model/python/requirements.txt | 2 +- 3 files changed, 51 insertions(+), 78 deletions(-) diff --git a/demos/using_onnx_model/python/README.md b/demos/using_onnx_model/python/README.md index 022a43be7b..6de85e6cbd 100644 --- a/demos/using_onnx_model/python/README.md +++ b/demos/using_onnx_model/python/README.md @@ -1,72 +1,65 @@ # Prediction Example with an ONNX Model {#ovms_demo_using_onnx_model} -Steps are similar to when you work with IR model format. Model Server accepts ONNX models as well with no differences in versioning. 
Locate ONNX model file in separate model version directory. +This demo demonstrates the steps required to deploy an ONNX‑based vision model. The workflow is optimized for rapid integration and ease of use: no model‑conversion step is needed, as the model is provided directly in ONNX format. +To further simplify deployment, the server applies all necessary image‑preprocessing operations, removing the need for the client to implement preprocessing pipelines such as normalization or color‑space transformation. This approach reduces development effort, ensures consistency with the model’s training configuration, and accelerates end‑to‑end deployment. +The server accepts image data in multiple formats, offering flexibility depending on the client environment. Images can be sent as: + +Raw arrays directly obtained from OpenCV or Pillow +Encoded images, including JPEG or PNG formats + +This enables seamless integration with a wide range of applications and client libraries. Below is a complete functional use case using Python 3.7 or higher. For this example let's use a public [ONNX ResNet](https://github.com/onnx/models/tree/main/validated/vision/classification/resnet) model - resnet50-caffe2-v1-9.onnx model. -This model requires additional [preprocessing function](https://github.com/onnx/models/tree/main/validated/vision/classification/resnet#preprocessing). Preprocessing can be performed in the client by manipulating data before sending the request. Preprocessing can be also delegated to the server by setting preprocessing parameters. Both methods will be explained below. +This model was trained using an additional [preprocessing](https://github.com/onnx/models/tree/main/validated/vision/classification/resnet#preprocessing). For inference, preprocessing can be executed on the client side by transforming the input data before sending the request. 
However, a more efficient approach is to delegate preprocessing to the server by configuring the appropriate preprocessing parameters. +In this demo, the `mean`, `scale`, `color_format` and `layout` parameters are adjusted. In addition to that, input precision conversion from fp32 to uint8 can improve performance and bandwidth efficiency. Raw images can be transmitted using more compact uint8 data, significantly reducing the payload size and lowering client‑side compute requirements. +See more details about [parameters](../../../docs/parameters.md). -[Option 1: Adding preprocessing to the client side](#option-1-adding-preprocessing-to-the-client-side) -[Option 2: Adding preprocessing to the server side](#option-2-adding-preprocessing-to-the-server-side) - -## Option 1: Adding preprocessing to the client side +## Model deployment with preprocessing Clone the repository and enter using_onnx_model directory + ```bash git clone https://github.com/openvinotoolkit/model_server.git cd model_server/demos/using_onnx_model/python ``` -Download classification model +Prepare environment ```bash curl --fail -L --create-dirs https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx -o workspace/resnet50-onnx/1/resnet50-caffe2-v1-9.onnx ``` -You should see `workspace` directory created with the following content: -```bash -workspace/ -└── resnet50-onnx - └── 1 - └── resnet50-caffe2-v1-9.onnx - -``` - -Start the OVMS container with single model instance: +Start the OVMS container with additional preprocessing options: ```bash docker run -d -u $(id -u):$(id -g) -v $(pwd)/workspace:/workspace -p 9001:9001 openvino/model_server:latest \ ---model_path /workspace/resnet50-onnx --model_name resnet --port 9001 +--model_path /workspace/resnet50-onnx --model_name resnet --port 9001 --layout NHWC:NCHW --mean "[123.675,116.28,103.53]" --scale "[58.395,57.12,57.375]" --shape "(1,224,224,3)" --color_format BGR:RGB --precision uint8:fp32 ``` -Install python client 
dependencies: -```bash -pip3 install -r requirements.txt -``` +## Running the client: -The `onnx_model_demo.py` script can run inference both with and without performing preprocessing. Since in this variant we want to run preprocessing on the client side let's set `--run_preprocessing` flag. +Since in this variant preprocessing is done by the model server, there's no need to perform any image normalization on the client side, and the `onnx_model_demo.py` script no longer has a `--run_preprocessing` option. By default, the script sends the encoded image file to the server as is. See [preprocessing function](https://github.com/openvinotoolkit/model_server/blob/releases/2026/0/demos/using_onnx_model/python/onnx_model_demo.py#L26-L33) run in the client. + +Run the client without preprocessing: + -Run the client with preprocessing: ```bash -python onnx_model_demo.py --service_url localhost:9001 --run_preprocessing -Running with preprocessing on client side -../../common/static/images/bee.jpeg (1, 3, 224, 224) ; data range: -2.117904 : 2.64 -Class is with highest score: 309 +pip3 install -r requirements.txt +python onnx_model_demo.py --service_url localhost:9001 +``` +Output: +``` +Running inference with image: ../../common/static/images/bee.jpeg +Class with highest score: 309 Detected class name: bee ``` -## Option 2: Adding preprocessing to the server side - -Start the OVMS container with additional preprocessing options: +The client can also be run with the `--send_tensor` flag, which decodes and resizes the image on the client with Pillow and sends it as a raw uint8 tensor. 
```bash -docker run -d -u $(id -u):$(id -g) -v $(pwd)/workspace:/workspace -p 9001:9001 openvino/model_server:latest \ ---model_path /workspace/resnet50-onnx --model_name resnet --port 9001 --layout NHWC:NCHW --mean "[123.675,116.28,103.53]" --scale "[58.395,57.12,57.375]" --shape "(1,224,224,3)" --color_format BGR:RGB +python onnx_model_demo.py --service_url localhost:9001 --send_tensor ``` - -The `onnx_model_demo.py` script can run inference both with and without performing preprocessing. Since in this variant preprocessing is done by the model server, there's no need to perform any image preprocessing on the client side. In that case, run without `--run_preprocessing` option. See [preprocessing function](https://github.com/openvinotoolkit/model_server/blob/releases/2026/0/demos/using_onnx_model/python/onnx_model_demo.py#L26-L33) run in the client. - -Run the client without preprocessing: -```bash -python onnx_model_demo.py --service_url localhost:9001 -Running without preprocessing on client side -Class is with highest score: 309 +Output: +``` +Running inference with image: ../../common/static/images/bee.jpeg +Class with highest score: 309 Detected class name: bee ``` +> **Note:** While adding preprocessing to the model input, shape needs to be set as static. 
\ No newline at end of file diff --git a/demos/using_onnx_model/python/onnx_model_demo.py b/demos/using_onnx_model/python/onnx_model_demo.py index c3c981debe..341e6a3d97 100644 --- a/demos/using_onnx_model/python/onnx_model_demo.py +++ b/demos/using_onnx_model/python/onnx_model_demo.py @@ -19,57 +19,37 @@ import argparse import numpy as np -import cv2 +from PIL import Image from ovmsclient import make_grpc_client import classes -def preprocess(img_data): - mean_vec = np.array([0.485, 0.456, 0.406]) - stddev_vec = np.array([0.229, 0.224, 0.225]) - norm_img_data = np.zeros(img_data.shape).astype('float32') - for i in range(img_data.shape[0]): - # for each pixel in each channel, divide the value by 255 to get value between [0, 1] and then normalize - norm_img_data[i,:,:] = (img_data[i,:,:]/255 - mean_vec[i]) / stddev_vec[i] - return norm_img_data - -def getJpeg(path, size): - with open(path, mode='rb') as file: - content = file.read() - - img = np.frombuffer(content, dtype=np.uint8) - img = cv2.imdecode(img, cv2.IMREAD_COLOR) # BGR format - # format of data is HWC - # add image preprocessing if needed by the model - img = cv2.resize(img, (224, 224)) - img = img.astype('float32') - #convert to NCHW - img = img.transpose(2,0,1) - # normalize to adjust to model training dataset - img = preprocess(img) - img = img.reshape(1,3,size,size) - print(path, img.shape, "; data range:",np.amin(img),":",np.amax(img)) - return img +def load_image(path): + with Image.open(path) as img: + img = img.resize((224, 224)) + np_img = np.array(img, dtype=np.uint8) + np_img = np_img[:, :, ::-1] + np_img = np.expand_dims(np_img, axis=0) + return np_img parser = argparse.ArgumentParser(description='Run prediction on ONNX ResNet50 Model') parser.add_argument('--image_path', required=False, default='../../common/static/images/bee.jpeg', help='Path to a file with a JPEG image') parser.add_argument('--service_url',required=False, default='localhost:9001', help='Specify url to grpc service. 
default:localhost:9001') -parser.add_argument('--run_preprocessing',required=False, action="store_true", help='Specify if preprocessing should be run on the client side. default: False') +parser.add_argument('--send_tensor', action="store_true", required=False, help='Send image after loading it with Pillow') args = vars(parser.parse_args()) -if args["run_preprocessing"]: - print("Running with preprocessing on client side") - img = getJpeg(args["image_path"], 224) - input_name = "data" +print(f"Running inference with image: {args['image_path']}") +if args["send_tensor"]: + img = load_image(args["image_path"]) else: - print("Running without preprocessing on client side") with open(args["image_path"], "rb") as f: img = f.read() - input_name = "data" + +input_name = "gpu_0/data_0" client = make_grpc_client(args["service_url"]) output = client.predict({input_name: img}, "resnet") max = np.argmax(output) -print("Class is with highest score: {}".format(max)) -print("Detected class name: {}".format(classes.imagenet_classes[max])) +print(f"Class with highest score: {max}") +print(f"Detected class name: {classes.imagenet_classes[max]}") diff --git a/demos/using_onnx_model/python/requirements.txt b/demos/using_onnx_model/python/requirements.txt index 69efc0967e..907a5e4852 100644 --- a/demos/using_onnx_model/python/requirements.txt +++ b/demos/using_onnx_model/python/requirements.txt @@ -1,3 +1,3 @@ -opencv-python>=4.8.1.78 ovmsclient numpy<2.0.0 +pillow==12.1.1 \ No newline at end of file From 01bbf54c73f936de95cebb36986b2e7ee2727bb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Wed, 18 Feb 2026 09:30:00 +0100 Subject: [PATCH 3/8] Fixing tests for agentic demo (#3980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 🛠 Summary [CVS-180417](https://jira.devtools.intel.com/browse/CVS-180417) Agentic demo needs to be suitable for tests. There are also some bugs and mistakes. 
- Changes in Qwen3-Coder with workaround command. - Removing non-agentic model ### 🧪 Checklist - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. `` --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../continuous_batching/agentic_ai/README.md | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index 98a3e1f0f9..42c4140714 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -22,7 +22,7 @@ Currently supported models: - mistralai/Mistral-7B-Instruct-v0.3 - microsoft/Phi-4-mini-instruct - Qwen/Qwen3-Coder-30B-A3B-Instruct -- openai/gpt-oss-20b* +- openai/gpt-oss-20b ### Export using python script @@ -237,13 +237,14 @@ ovms.exe --rest_port 8000 --source_model OpenVINO/Phi-4-mini-instruct-int4-ov -- :::{tab-item} Qwen3-Coder-30B-A3B-Instruct :sync: Qwen3-Coder-30B-A3B-Instruct ```bat +set MOE_USE_MICRO_GEMM_PREFILL=0 ovms.exe --rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true ``` ::: :::{tab-item} gpt-oss-20b :sync: gpt-oss-20b ```bat -ovms.exe --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation --enable_prefix_caching true --target_device GPU +ovms.exe --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU ``` > **Note:**: Use `--pipeline_type LM` parameter in export command, for version 2025.4.*. It disables continuous batching. 
Not needed in last weekly or 2026.0+ releases. ::: @@ -294,12 +295,6 @@ ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-4B-int4-ov --model_repos ovms.exe --rest_port 8000 --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --model_repository_path models --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000 ``` ::: -:::{tab-item} Phi-3-mini-4k-instruct-int4-cw-ov -:sync: Phi-3-mini-4k-instruct-int4-cw-ov -```bat -ovms.exe --rest_port 8000 --source_model OpenVINO/Phi-3-mini-4k-instruct-int4-cw-ov --model_repository_path models --tool_parser phi4 --target_device NPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000 -``` -::: :::: > **Note:** Setting the `--max_prompt_len` parameter too high may lead to performance degradation. It is recommended to use the smallest value that meets your requirements. @@ -380,8 +375,8 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model :::{tab-item} Qwen3-Coder-30B-A3B-Instruct :sync: Qwen3-Coder-30B-A3B-Instruct ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --task text_generation --cache_dir .cache --enable_prefix_caching true +docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ +--rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --task text_generation --enable_prefix_caching true ``` ::: :::: @@ -467,7 +462,7 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model :::{tab-item} Qwen3-Coder-30B-A3B-Instruct :sync: Qwen3-Coder-30B-A3B-Instruct 
```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --target_device GPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true ``` ::: @@ -538,13 +533,6 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model --rest_port 8000 --model_repository_path models --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 4000 ``` ::: -:::{tab-item} Phi-3-mini-4k-instruct-int4-cw-ov -:sync: Phi-3-mini-4k-instruct-int4-cw-ov -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Phi-3-mini-4k-instruct-int4-cw-ov --tool_parser phi4 --target_device NPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true --max_prompt_len 4000 -``` -::: :::: ### Deploy all models in a single container @@ -621,7 +609,7 @@ python openai_agent.py --query "List the files in folder /root" --model meta-lla ::: :::{tab-item} Phi-4-mini-instruct :sync: Phi-4-mini-instruct -```console +```bash python openai_agent.py --query "What is the current weather in Tokyo?" 
--model microsoft/Phi-4-mini-instruct --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` ::: @@ -651,7 +639,7 @@ python openai_agent.py --query "What is the current weather in Tokyo?" --model Q ::: :::{tab-item} gpt-oss-20b :sync: gpt-oss-20b -```bash +```console python openai_agent.py --query "What is the current weather in Tokyo?" --model openai/gpt-oss-20b --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` ::: From c235192366dfff0de174b966e3106a975055be4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Thu, 12 Feb 2026 13:11:43 +0100 Subject: [PATCH 4/8] Missing option in help (#3963) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 🛠 Summary [CVS-180602](https://jira.devtools.intel.com/browse/CVS-180602) Adding missing option to --help in docs ### 🧪 Checklist - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. 
`` --- demos/common/export_models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index f9aa89e7cf..fe2abe3684 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -45,7 +45,7 @@ Expected Output: usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR] [--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS] [--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--draft_eagle3_mode] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding] [--reasoning_parser {qwen3,gptoss}] - [--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss}] [--enable_tool_guided_generation] + [--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral}] [--enable_tool_guided_generation] options: -h, --help show this help message and exit From 9f3a09ca02c9f76fb0cf432451c388e231d79775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Thu, 12 Feb 2026 13:30:04 +0100 Subject: [PATCH 5/8] Cherry-pick: Trivy/dependabot issues fix (#3964) Updating requirements for some python packages which Trivy indicated as vulnerable - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. 
`` --- client/python/tensorflow-serving-api/samples/requirements.txt | 2 +- demos/common/stream_client/requirements.txt | 4 ++-- demos/mediapipe/object_detection/requirements.txt | 4 ++-- .../clip_image_classification/download_model_requirements.txt | 2 +- demos/python_demos/requirements.txt | 4 ++-- tests/requirements.txt | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/client/python/tensorflow-serving-api/samples/requirements.txt b/client/python/tensorflow-serving-api/samples/requirements.txt index cc5b30cbdb..a436136863 100644 --- a/client/python/tensorflow-serving-api/samples/requirements.txt +++ b/client/python/tensorflow-serving-api/samples/requirements.txt @@ -2,5 +2,5 @@ tensorflow-serving-api==2.16.1; python_version >= "3.12" tensorflow-serving-api<=2.13.1,>=2.10.1; python_version < "3.12" tensorflow==2.16.1; python_version >= "3.12" tensorflow<=2.13.1,>=2.10.1; python_version < "3.12" -protobuf<=4.53.0; python_version >= "3.12" +protobuf<=5.29.6; python_version >= "3.12" protobuf<=4.25.8; python_version < "3.12" diff --git a/demos/common/stream_client/requirements.txt b/demos/common/stream_client/requirements.txt index 183f58db18..7a967c0c6d 100644 --- a/demos/common/stream_client/requirements.txt +++ b/demos/common/stream_client/requirements.txt @@ -1,5 +1,5 @@ -tritonclient[grpc]==2.41.0 +tritonclient[grpc] ffmpeg-python==0.2.0 opencv-python==4.9.0.80 -protobuf==4.25.8 +protobuf==5.29.6 numpy<2.0.0 diff --git a/demos/mediapipe/object_detection/requirements.txt b/demos/mediapipe/object_detection/requirements.txt index 0152919a11..6ca517cc4c 100644 --- a/demos/mediapipe/object_detection/requirements.txt +++ b/demos/mediapipe/object_detection/requirements.txt @@ -1,5 +1,5 @@ -protobuf==4.25.8 -tritonclient[all]==2.45.0 +protobuf==5.29.6 +tritonclient[all] requests==2.32.4 grpcio opencv-python>=4.8.1.78 diff --git a/demos/python_demos/clip_image_classification/download_model_requirements.txt 
b/demos/python_demos/clip_image_classification/download_model_requirements.txt index 285a67f2dd..142bcfd2f4 100644 --- a/demos/python_demos/clip_image_classification/download_model_requirements.txt +++ b/demos/python_demos/clip_image_classification/download_model_requirements.txt @@ -4,5 +4,5 @@ openvino==2025.4.* numpy<2.0 transformers<=4.53.0 -pillow==10.3.0 +pillow==12.1.1 torch==2.8.0+cpu diff --git a/demos/python_demos/requirements.txt b/demos/python_demos/requirements.txt index f4d82a82d3..38395b51e5 100644 --- a/demos/python_demos/requirements.txt +++ b/demos/python_demos/requirements.txt @@ -1,12 +1,12 @@ --extra-index-url "https://download.pytorch.org/whl/cpu" optimum-intel@git+https://github.com/huggingface/optimum-intel.git@2c48d6430c265ac259c1b264f3e2c4025cdd7b76 -pillow==10.3.0 +pillow==12.1.1 tritonclient[grpc]==2.57.0 # Required to use batch string serialization/deserialization (4byte length prepend) numpy<2.0 huggingface_hub==0.32.0 nncf@git+https://github.com/openvinotoolkit/nncf.git@7a6ac29a15d247c0fc76da93f4d53bbfa7b489cc sentence_transformers -sentencepiece==0.2.0 +sentencepiece==0.2.1 transformers<=4.53 einops torchvision diff --git a/tests/requirements.txt b/tests/requirements.txt index c6292bfab6..70a167fb5c 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -12,6 +12,6 @@ tensorflow==2.16.1; python_version >= "3.12" tensorflow<=2.13.1,>=2.10.1; python_version < "3.12" requests==2.32.4 retry==0.9.2 -protobuf<=4.53.0 +protobuf<=5.29.6 jsonschema<=4.23.0 openai<=1.84.0 From 6e0a586027b84c0debd9b738ae94697d720717cc Mon Sep 17 00:00:00 2001 From: Pawel Date: Thu, 19 Feb 2026 13:32:25 +0100 Subject: [PATCH 6/8] fix --- demos/common/export_models/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt index af6d59e63e..44633de1e6 100644 --- a/demos/common/export_models/requirements.txt +++ 
b/demos/common/export_models/requirements.txt @@ -8,7 +8,7 @@ datasets diffusers # for image generation einops nncf@git+https://github.com/openvinotoolkit/nncf.git@7a6ac29a15d247c0fc76da93f4d53bbfa7b489cc -numpy==2.2.6 # temporary change until ov tokenizers upgrade +numpy==2.2.6 openvino-tokenizers==2026.0.0rc3 openvino==2026.0.0rc3 pillow From b8007cc2d882a09c774890ca333513a8bbc12f2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Thu, 19 Feb 2026 15:04:15 +0100 Subject: [PATCH 7/8] Adding env var for Qwen3-coder - continue demo (#3991) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 🛠 Summary Adding environmental variable for Qwen3-Coder-30B to make it work correctly ### 🧪 Checklist - [ ] Unit tests added. - [ ] The documentation updated. - [ ] Change follows security best practices. `` --- demos/code_local_assistant/README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md index a49eaa5547..d23c520d4f 100644 --- a/demos/code_local_assistant/README.md +++ b/demos/code_local_assistant/README.md @@ -215,6 +215,7 @@ Run OpenVINO Model Server with all downloaded models loaded at the same time: Please refer to OpenVINO Model Server installation first: [link](../../docs/deploying_server_baremetal.md) ```bat +set MOE_USE_MICRO_GEMM_PREFILL=0 ovms --rest_port 8000 --config_path ./models/config_all.json ``` ::: @@ -222,7 +223,7 @@ ovms --rest_port 8000 --config_path ./models/config_all.json :sync: Linux CPU ### Linux: via Docker with CPU ```bash -docker run -d --rm -u $(id -u):$(id -g) \ +docker run -d --rm -u $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 \ -p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json ``` ::: @@ -230,12 +231,14 @@ docker run -d --rm -u $(id -u):$(id -g) \ :sync: Linux GPU ### Linux: via Docker with 
GPU ```bash -docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ +docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 \ -p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json ``` ::: :::: +> **Note:** `MOE_USE_MICRO_GEMM_PREFILL=0` is a workaround for *Qwen3-Coder-30B-A3B-Instruct*; the underlying issue will be fixed in release 2026.1 or the next weekly build. + ## Set Up Visual Studio Code ### Download [Continue plugin](https://www.continue.dev/) From ba7fcee55e3accbd6f28179b48729463e7a17fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Mon, 23 Feb 2026 12:03:55 +0100 Subject: [PATCH 8/8] Update README.md --- demos/continuous_batching/agentic_ai/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index 42c4140714..bafeba3103 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -22,7 +22,7 @@ Currently supported models: - mistralai/Mistral-7B-Instruct-v0.3 - microsoft/Phi-4-mini-instruct - Qwen/Qwen3-Coder-30B-A3B-Instruct -- openai/gpt-oss-20b +- openai/gpt-oss-20b* ### Export using python script