Merged
@@ -2,5 +2,5 @@ tensorflow-serving-api==2.16.1; python_version >= "3.12"
tensorflow-serving-api<=2.13.1,>=2.10.1; python_version < "3.12"
tensorflow==2.16.1; python_version >= "3.12"
tensorflow<=2.13.1,>=2.10.1; python_version < "3.12"
-protobuf<=4.53.0; python_version >= "3.12"
+protobuf<=5.29.6; python_version >= "3.12"

**Collaborator:** @mzegla isn't the Python client EOL? When and why did we change it? @przepeck

**Collaborator (author):** Trivy tests raised this issue, and we needed to update the version of this package.

**Collaborator (author):** You mean ovmsclient; it's EOL, but I think these clients should still be supported.

protobuf<=4.25.8; python_version < "3.12"
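The protobuf pins above are selected by environment markers on the interpreter version; a minimal sketch of how they resolve (the helper function name is ours, not part of the repo; version bounds copied from the diff):

```python
import sys

def protobuf_constraint(version_info=sys.version_info):
    # Mirrors the environment markers in the requirements file:
    # protobuf<=5.29.6 on Python >= 3.12, protobuf<=4.25.8 below.
    if version_info >= (3, 12):
        return "protobuf<=5.29.6"
    return "protobuf<=4.25.8"

print(protobuf_constraint((3, 12)))  # protobuf<=5.29.6
print(protobuf_constraint((3, 11)))  # protobuf<=4.25.8
```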
7 changes: 5 additions & 2 deletions demos/code_local_assistant/README.md
@@ -215,27 +215,30 @@ Run OpenVINO Model Server with all downloaded models loaded at the same time:
Please refer to OpenVINO Model Server installation first: [link](../../docs/deploying_server_baremetal.md)

```bat
set MOE_USE_MICRO_GEMM_PREFILL=0
ovms --rest_port 8000 --config_path ./models/config_all.json
```
:::
:::{tab-item} Linux CPU
:sync: Linux CPU
### Linux: via Docker with CPU
```bash
-docker run -d --rm -u $(id -u):$(id -g) \
+docker run -d --rm -u $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json
```
:::
:::{tab-item} Linux GPU
:sync: Linux GPU
### Linux: via Docker with GPU
```bash
-docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json
```
:::
::::

> **Note:** `MOE_USE_MICRO_GEMM_PREFILL=0` is a workaround for *Qwen3-Coder-30B-A3B-Instruct*; the underlying issue will be fixed in release 2026.1 or an upcoming weekly.

## Set Up Visual Studio Code

### Download [Continue plugin](https://www.continue.dev/)
3 changes: 2 additions & 1 deletion demos/common/export_models/README.md
@@ -1,6 +1,7 @@
# Exporting models using script {#ovms_demos_common_export}

This document describes how to export, optimize, and configure models prior to server deployment using the provided Python script. This approach is more flexible than the [pull feature](../../../docs/pull_hf_models.md) in OVMS, as it allows using models that were not optimized beforehand and published in the [OpenVINO organization](https://huggingface.co/OpenVINO) on HuggingFace, but it requires a working Python setup.
> **Warning:** This script uses the `--trust-remote-code` option, which allows model-provided Python code to run on your machine during export. Use only trusted models/sources, review repository code before running, and avoid executing this script in sensitive environments.

## What it does

@@ -44,7 +45,7 @@ Expected Output:
usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR]
[--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS]
[--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--draft_eagle3_mode] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding] [--reasoning_parser {qwen3,gptoss}]
-[--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss}] [--enable_tool_guided_generation]
+[--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral}] [--enable_tool_guided_generation]

options:
-h, --help show this help message and exit
2 changes: 1 addition & 1 deletion demos/common/export_models/export_model.py
@@ -424,7 +424,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name
raise ValueError("Failed to export llm model", source_model)
if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
-convert_tokenizer_command = "convert_tokenizer --with-detokenizer -o {} {}".format(llm_model_path, source_model)
+convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}"
if os.system(convert_tokenizer_command):
raise ValueError("Failed to export tokenizer and detokenizer", source_model)
### Export draft model for speculative decoding
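The hunk above keeps `os.system` for running `convert_tokenizer`; as a side note, a hedged sketch (the helper name is ours, not part of the repo) of building the same command as an argument list for `subprocess.run`, which avoids shell quoting issues when paths contain spaces:

```python
import shlex
import subprocess  # subprocess.run(cmd, check=True) would replace os.system

def build_convert_tokenizer_cmd(llm_model_path, source_model):
    # Same flags as the diff above, but as an argument list,
    # so no shell parsing is involved.
    return [
        "convert_tokenizer",
        "--with-detokenizer",
        "--trust-remote-code",
        "-o", llm_model_path,
        source_model,
    ]

cmd = build_convert_tokenizer_cmd("models/my llm", "org/my-model")
print(shlex.join(cmd))
```

With `subprocess.run(cmd, check=True)`, a non-zero exit code raises `CalledProcessError`, replacing the manual return-code check after `os.system`.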
2 changes: 1 addition & 1 deletion demos/common/export_models/requirements.txt
@@ -8,7 +8,7 @@ datasets
diffusers # for image generation
einops
nncf@git+https://github.com/openvinotoolkit/nncf.git@7a6ac29a15d247c0fc76da93f4d53bbfa7b489cc
-numpy
+numpy==2.2.6
openvino-tokenizers==2026.0.0rc4
openvino==2026.0.0rc4
pillow
4 changes: 2 additions & 2 deletions demos/common/stream_client/requirements.txt
@@ -1,5 +1,5 @@
-tritonclient[grpc]==2.41.0
+tritonclient[grpc]
ffmpeg-python==0.2.0
opencv-python==4.9.0.80
-protobuf==4.25.8
+protobuf==5.29.6
numpy<2.0.0
26 changes: 7 additions & 19 deletions demos/continuous_batching/agentic_ai/README.md
@@ -237,13 +237,14 @@ ovms.exe --rest_port 8000 --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --
:::{tab-item} Qwen3-Coder-30B-A3B-Instruct
:sync: Qwen3-Coder-30B-A3B-Instruct
```bat
set MOE_USE_MICRO_GEMM_PREFILL=0
ovms.exe --rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true
```
:::
:::{tab-item} gpt-oss-20b
:sync: gpt-oss-20b
```bat
-ovms.exe --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation --enable_prefix_caching true --target_device GPU
+ovms.exe --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU
```
> **Note:** Use the `--pipeline_type LM` parameter in the export command for version 2025.4.*; it disables continuous batching. It is not needed in the latest weekly or 2026.0+ releases.
:::
@@ -294,12 +295,6 @@ ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-4B-int4-ov --model_repos
ovms.exe --rest_port 8000 --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --model_repository_path models --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000
```
:::
-:::{tab-item} Phi-3-mini-4k-instruct-int4-cw-ov
-:sync: Phi-3-mini-4k-instruct-int4-cw-ov
-```bat
-ovms.exe --rest_port 8000 --source_model OpenVINO/Phi-3-mini-4k-instruct-int4-cw-ov --model_repository_path models --tool_parser phi4 --target_device NPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000
-```
-:::
::::

> **Note:** Setting the `--max_prompt_len` parameter too high may lead to performance degradation. It is recommended to use the smallest value that meets your requirements.
@@ -380,8 +375,8 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model
:::{tab-item} Qwen3-Coder-30B-A3B-Instruct
:sync: Qwen3-Coder-30B-A3B-Instruct
```bash
-docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \
--rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --task text_generation --cache_dir .cache --enable_prefix_caching true
+docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \
+--rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --task text_generation --enable_prefix_caching true
```
:::
::::
@@ -467,7 +462,7 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model
:::{tab-item} Qwen3-Coder-30B-A3B-Instruct
:sync: Qwen3-Coder-30B-A3B-Instruct
```bash
-docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \
+docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \
--rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --target_device GPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true
```
:::
@@ -538,13 +533,6 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model
--rest_port 8000 --model_repository_path models --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 4000
```
:::
-:::{tab-item} Phi-3-mini-4k-instruct-int4-cw-ov
-:sync: Phi-3-mini-4k-instruct-int4-cw-ov
-```bash
-docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \
---rest_port 8000 --model_repository_path models --source_model OpenVINO/Phi-3-mini-4k-instruct-int4-cw-ov --tool_parser phi4 --target_device NPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true --max_prompt_len 4000
-```
-:::

**Collaborator:** Did we stop supporting Phi?

**Collaborator (author):** It's a non-agentic model; it got there by mistake.
::::

### Deploy all models in a single container
@@ -621,7 +609,7 @@ python openai_agent.py --query "List the files in folder /root" --model meta-lla
:::
:::{tab-item} Phi-4-mini-instruct
:sync: Phi-4-mini-instruct
-```console
+```bash
python openai_agent.py --query "What is the current weather in Tokyo?" --model microsoft/Phi-4-mini-instruct --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
```
:::
@@ -651,7 +639,7 @@ python openai_agent.py --query "What is the current weather in Tokyo?" --model Q
:::
:::{tab-item} gpt-oss-20b
:sync: gpt-oss-20b
-```bash
+```console
python openai_agent.py --query "What is the current weather in Tokyo?" --model openai/gpt-oss-20b --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
```
:::
2 changes: 0 additions & 2 deletions demos/continuous_batching/vlm/README.md
@@ -91,8 +91,6 @@ python export_model.py text_generation --source_model OpenGVLab/InternVL2-2B --w
python export_model.py text_generation --source_model OpenGVLab/InternVL2-2B --weight-format int4 --pipeline_type VLM --model_name OpenGVLab/InternVL2-2B --config_file_path models/config.json --model_repository_path models --overwrite_models --target_device GPU
```

-> **Note:** Change the `--weight-format` to quantize the model to `fp16` or `int8` precision to reduce memory consumption and improve performance.

> **Note:** You can swap the model used in the demo for any topology [tested](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) with OpenVINO.
Be aware that QwenVL models executed on GPU might experience execution errors with very high resolution images. In such cases, it is recommended to reduce the `max_pixels` parameter in `preprocessor_config.json`.

4 changes: 2 additions & 2 deletions demos/mediapipe/object_detection/requirements.txt
@@ -1,5 +1,5 @@
-protobuf==4.25.8
-tritonclient[all]==2.45.0
+protobuf==5.29.6
+tritonclient[all]
requests==2.32.4
grpcio
opencv-python>=4.8.1.78
@@ -4,5 +4,5 @@
openvino==2025.4.*
numpy<2.0
transformers<=4.53.0
-pillow==10.3.0
+pillow==12.1.1
torch==2.8.0+cpu
4 changes: 2 additions & 2 deletions demos/python_demos/requirements.txt
@@ -1,12 +1,12 @@
--extra-index-url "https://download.pytorch.org/whl/cpu"
optimum-intel@git+https://github.com/huggingface/optimum-intel.git@2c48d6430c265ac259c1b264f3e2c4025cdd7b76
-pillow==10.3.0
+pillow==12.1.1
tritonclient[grpc]==2.57.0 # Required to use batch string serialization/deserialization (4byte length prepend)
numpy<2.0
huggingface_hub==0.32.0
nncf@git+https://github.com/openvinotoolkit/nncf.git@7a6ac29a15d247c0fc76da93f4d53bbfa7b489cc
sentence_transformers
-sentencepiece==0.2.0
+sentencepiece==0.2.1
transformers<=4.53
einops
torchvision
75 changes: 34 additions & 41 deletions demos/using_onnx_model/python/README.md
@@ -1,72 +1,65 @@
# Prediction Example with an ONNX Model {#ovms_demo_using_onnx_model}

-Steps are similar to when you work with IR model format. Model Server accepts ONNX models as well with no differences in versioning. Locate ONNX model file in separate model version directory.
+This demo shows the steps required to deploy an ONNX-based vision model. The workflow is optimized for rapid integration and ease of use: no model-conversion step is needed, as the model is provided directly in ONNX format.
+To further simplify deployment, the server applies all necessary image-preprocessing operations, removing the need for the client to implement preprocessing pipelines such as normalization or color-space transformation. This approach reduces development effort, ensures consistency with the model's training configuration, and accelerates end-to-end deployment.
The server accepts image data in multiple formats, offering flexibility depending on the client environment. Images can be sent as:

- Raw arrays directly obtained from OpenCV or Pillow
- Encoded images, including JPEG or PNG formats

This enables seamless integration with a wide range of applications and client libraries.
Below is a complete functional use case using Python 3.7 or higher.
For this example, let's use the public [ONNX ResNet](https://github.com/onnx/models/tree/main/validated/vision/classification/resnet) model `resnet50-caffe2-v1-9.onnx`.

-This model requires additional [preprocessing function](https://github.com/onnx/models/tree/main/validated/vision/classification/resnet#preprocessing). Preprocessing can be performed in the client by manipulating data before sending the request. Preprocessing can be also delegated to the server by setting preprocessing parameters. Both methods will be explained below.
+This model was trained with additional [preprocessing](https://github.com/onnx/models/tree/main/validated/vision/classification/resnet#preprocessing). For inference, preprocessing can be executed on the client side by transforming the input data before sending the request. However, a more efficient approach is to delegate preprocessing to the server by configuring the appropriate preprocessing parameters.
Here, `mean`, `scale`, `color`, and `layout` will be adjusted. In addition, converting the input precision from fp32 to uint8 can improve performance and bandwidth efficiency: raw images can be transmitted as more compact uint8 data, significantly reducing the payload size and lowering client-side compute requirements.
More details are available in [parameters](../../../docs/parameters.md).
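To illustrate the per-pixel arithmetic the server performs when `--mean` and `--scale` are set, here is a minimal pure-Python sketch (the function name is ours; the mean/scale values are the ones used later in this demo):

```python
# Per-channel normalization matching the server-side options
# --mean "[123.675,116.28,103.53]" --scale "[58.395,57.12,57.375]"
MEAN = [123.675, 116.28, 103.53]
SCALE = [58.395, 57.12, 57.375]

def normalize_pixel(rgb):
    # rgb: one pixel as [R, G, B] uint8 values; returns fp32-style floats
    return [(v - m) / s for v, m, s in zip(rgb, MEAN, SCALE)]

print([round(x, 3) for x in normalize_pixel([128, 128, 128])])
# [0.074, 0.205, 0.426]
```

Delegating this to the server means the client can send the raw uint8 pixels unchanged.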

[Option 1: Adding preprocessing to the client side](#option-1-adding-preprocessing-to-the-client-side)
[Option 2: Adding preprocessing to the server side](#option-2-adding-preprocessing-to-the-server-side)

-## Option 1: Adding preprocessing to the client side
+## Model deployment with preprocessing

Clone the repository and enter the `using_onnx_model` directory:

```bash
git clone https://github.com/openvinotoolkit/model_server.git
cd model_server/demos/using_onnx_model/python
```

-Download classification model
+Prepare environment
```bash
curl --fail -L --create-dirs https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx -o workspace/resnet50-onnx/1/resnet50-caffe2-v1-9.onnx
```

You should see `workspace` directory created with the following content:
```bash
workspace/
└── resnet50-onnx
└── 1
└── resnet50-caffe2-v1-9.onnx

```

-Start the OVMS container with single model instance:
+Start the OVMS container with additional preprocessing options:
```bash
docker run -d -u $(id -u):$(id -g) -v $(pwd)/workspace:/workspace -p 9001:9001 openvino/model_server:latest \
---model_path /workspace/resnet50-onnx --model_name resnet --port 9001
+--model_path /workspace/resnet50-onnx --model_name resnet --port 9001 --layout NHWC:NCHW --mean "[123.675,116.28,103.53]" --scale "[58.395,57.12,57.375]" --shape "(1,224,224,3)" --color_format BGR:RGB --precision uint8:fp32
```

Install python client dependencies:
```bash
pip3 install -r requirements.txt
```
## Running the client

-The `onnx_model_demo.py` script can run inference both with and without performing preprocessing. Since in this variant we want to run preprocessing on the client side let's set `--run_preprocessing` flag.
+The `onnx_model_demo.py` script can run inference both with and without performing preprocessing. Since in this variant preprocessing is done by the model server, there is no need to perform any image preprocessing on the client side. In that case, run without the `--run_preprocessing` option. See the [preprocessing function](https://github.com/openvinotoolkit/model_server/blob/releases/2026/0/demos/using_onnx_model/python/onnx_model_demo.py#L26-L33) that would otherwise run in the client.

+Run the client without preprocessing:
-Run the client with preprocessing:
```bash
-python onnx_model_demo.py --service_url localhost:9001 --run_preprocessing
-Running with preprocessing on client side
-../../common/static/images/bee.jpeg (1, 3, 224, 224) ; data range: -2.117904 : 2.64
-Class is with highest score: 309
+pip3 install -r requirements.txt
+python onnx_model_demo.py --service_url localhost:9001
```
Output:
```
Running inference with image: ../../common/static/images/bee.jpeg
Class with highest score: 309
Detected class name: bee
```

-## Option 2: Adding preprocessing to the server side
-
-Start the OVMS container with additional preprocessing options:
+The client can also be run with the `--send_tensor` flag, which reads the encoded input image and sends it with uint8 precision.
```bash
-docker run -d -u $(id -u):$(id -g) -v $(pwd)/workspace:/workspace -p 9001:9001 openvino/model_server:latest \
---model_path /workspace/resnet50-onnx --model_name resnet --port 9001 --layout NHWC:NCHW --mean "[123.675,116.28,103.53]" --scale "[58.395,57.12,57.375]" --shape "(1,224,224,3)" --color_format BGR:RGB
+python onnx_model_demo.py --service_url localhost:9001 --send_tensor
```

Run the client without preprocessing:
```bash
python onnx_model_demo.py --service_url localhost:9001
```
Output:
```
Running inference with image: ../../common/static/images/bee.jpeg
Class with highest score: 309
Detected class name: bee
```
> **Note:** When adding preprocessing to the model input, the shape needs to be set as static.