From f90fd93748b430e9aebc3fdf646a2690ecdec05d Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 9 Feb 2026 17:42:48 +0100 Subject: [PATCH 1/9] Npu docs --- demos/common/export_models/export_model.py | 4 +++ demos/embeddings/README.md | 37 +++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 97a9149b8b..687ae7a628 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -486,6 +486,10 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, destination_path = os.path.join(model_repository_path, model_name) print("Exporting embeddings model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: + if task_parameters['target_device'] == 'NPU': + if task_parameters['extra_quantization_params'] == "": + print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1") + task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1" optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path) print('Running command:', optimum_command) # for debug purposes if os.system(optimum_command): diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index b74042f754..72fbc6ee9a 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -243,7 +243,6 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp ::: :::: - > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. 
> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. @@ -297,6 +296,26 @@ All models supported by [optimum-intel](https://github.com/huggingface/optimum-i |sentence-transformers/all-mpnet-base-v2|MEAN| +**NPU** +::::{tab-set} +:::{tab-item} Qwen/Qwen3-Embedding-0.6B +:sync: Qwen3-Embedding-0.6B-fp16 +```console +python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --pooling LAST --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models +``` +::: +:::: + +> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is ised for better accuracy but `int8` and `int4` has better performance. +> **Note** Pooling mode --pooling LAST has the best accuracy. + +## Tested models +All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. The demo is validated against following Hugging Face models: + +|Model name|Pooling| +|---|---| +|Wqen/Qwen3-Embedding-0.6B|LAST| + ## Server Deployment :::{dropdown} **Deploying with Docker** @@ -313,6 +332,14 @@ to `docker run` command, use the image with GPU support and make sure set the ta ```bash docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json ``` +**NPU** + +In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` +to `docker run` command, use the image with NPU support and make sure set the target_device in subconfig.json to NPU. Also make sure the export model quantization level and cache size fit to the NPU memory. 
All of that can be applied with the commands: + +```bash +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json +``` ::: :::{dropdown} **Deploying on Bare Metal** @@ -368,9 +395,17 @@ curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "usage":{"prompt_tokens":4,"total_tokens":4} } + ``` ::: +## Client code for NPU + +:::{dropdown} **Request embeddings with cURL** +```bash +curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"Qwen/Qwen3-Embedding-0.6B\", \"input\": \"hello world\"}" +``` + :::{dropdown} **Request embeddings with OpenAI Python package** ```bash From 6fde3307f94ac8337683bc888e156fc370cfa878 Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 9 Feb 2026 17:55:04 +0100 Subject: [PATCH 2/9] Spell --- demos/embeddings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 72fbc6ee9a..55157de1f3 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -306,7 +306,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- ::: :::: -> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is ised for better accuracy but `int8` and `int4` has better performance. +> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is used for better accuracy but `int8` and `int4` has better performance. > **Note** Pooling mode --pooling LAST has the best accuracy. 
## Tested models From eebc30fec92a843f35dcf549c12fe6f5e69fba9a Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 10 Feb 2026 11:02:33 +0100 Subject: [PATCH 3/9] Remove reference that all models should work, they dont --- demos/embeddings/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 55157de1f3..c8f4179baa 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -310,11 +310,11 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- > **Note** Pooling mode --pooling LAST has the best accuracy. ## Tested models -All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. The demo is validated against following Hugging Face models: +The demo is validated against following Hugging Face models: |Model name|Pooling| |---|---| -|Wqen/Qwen3-Embedding-0.6B|LAST| +|Qwen/Qwen3-Embedding-0.6B|LAST| ## Server Deployment From 82faddcdac3bf8a57e61414a0d364797d5f61850 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 10 Feb 2026 11:10:38 +0100 Subject: [PATCH 4/9] Remove default quantization params since those are only required for int4 --- demos/common/export_models/export_model.py | 4 ---- demos/embeddings/README.md | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 687ae7a628..97a9149b8b 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -486,10 +486,6 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, destination_path = os.path.join(model_repository_path, model_name) print("Exporting embeddings model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: - if task_parameters['target_device'] == 'NPU': - if 
task_parameters['extra_quantization_params'] == "": - print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1") - task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1" optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path) print('Running command:', optimum_command) # for debug purposes if os.system(optimum_command): diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index c8f4179baa..f499bc2086 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -308,6 +308,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is used for better accuracy but `int8` and `int4` has better performance. > **Note** Pooling mode --pooling LAST has the best accuracy. 
+> **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` ## Tested models The demo is validated against following Hugging Face models: From 4bc4006b492cd3943063ff1d0227c1f1fce3c4b9 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Mon, 16 Feb 2026 13:18:45 +0100 Subject: [PATCH 5/9] Apply suggestions from code review Co-authored-by: Trawinski, Dariusz --- demos/embeddings/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index f499bc2086..835fd75925 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -306,7 +306,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- ::: :::: -> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is used for better accuracy but `int8` and `int4` has better performance. +> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision. For int4 precisions, add required extra parameter `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` > **Note** Pooling mode --pooling LAST has the best accuracy. > **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` @@ -334,7 +334,7 @@ to `docker run` command, use the image with GPU support and make sure set the ta docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json ``` **NPU** - +NOTE: NPU execution for embeddings model is a preview feature. 
In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command, use the image with NPU support and make sure set the target_device in subconfig.json to NPU. Also make sure the export model quantization level and cache size fit to the NPU memory. All of that can be applied with the commands: From aabf97660dff32365c5af2e3aaee5884fcfb2428 Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 16 Feb 2026 13:31:29 +0100 Subject: [PATCH 6/9] Code review --- demos/embeddings/README.md | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 835fd75925..8b54466e2f 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -310,12 +310,16 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- > **Note** Pooling mode --pooling LAST has the best accuracy. > **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` -## Tested models +## Tested NPU models The demo is validated against following Hugging Face models: |Model name|Pooling| |---|---| |Qwen/Qwen3-Embedding-0.6B|LAST| +|BAAI/bge-large-en-v1.5|CLS| +|BAAI/bge-large-zh-v1.5|CLS| +|thenlper/gte-small|CLS| +|sentence-transformers/all-mpnet-base-v2|MEAN| ## Server Deployment @@ -335,7 +339,7 @@ docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /de ``` **NPU** NOTE: NPU execution for embeddings model is a preview feature. 
-In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` +In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command, use the image with NPU support and make sure set the target_device in subconfig.json to NPU. Also make sure the export model quantization level and cache size fit to the NPU memory. All of that can be applied with the commands: ```bash @@ -400,15 +404,6 @@ curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d ``` ::: -## Client code for NPU - -:::{dropdown} **Request embeddings with cURL** -```bash -curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"Qwen/Qwen3-Embedding-0.6B\", \"input\": \"hello world\"}" -``` - -:::{dropdown} **Request embeddings with OpenAI Python package** - ```bash pip3 install openai "numpy<2" ``` From db1d4eef4a2d47da2fb08162ee6efb9155091232 Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 16 Feb 2026 13:33:44 +0100 Subject: [PATCH 7/9] Fix --- demos/embeddings/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 8b54466e2f..27f714e6c7 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -243,6 +243,7 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp ::: :::: + > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. 
@@ -404,6 +405,8 @@ curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d ``` ::: +:::{dropdown} **Request embeddings with OpenAI Python package** + ```bash pip3 install openai "numpy<2" ``` From f63121ca0510769926b96f6627ad433a6426b0c1 Mon Sep 17 00:00:00 2001 From: rasapala Date: Wed, 18 Feb 2026 11:26:04 +0100 Subject: [PATCH 8/9] Fix model --- demos/embeddings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 27f714e6c7..189823c44d 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -302,7 +302,7 @@ All models supported by [optimum-intel](https://github.com/huggingface/optimum-i :::{tab-item} Qwen/Qwen3-Embedding-0.6B :sync: Qwen3-Embedding-0.6B-fp16 ```console -python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --pooling LAST --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models ``` ::: :::: From a2899c3793b45231ec88ee821187f4f51b1b4ca2 Mon Sep 17 00:00:00 2001 From: rasapala Date: Thu, 19 Feb 2026 09:38:58 +0100 Subject: [PATCH 9/9] Code review --- demos/embeddings/README.md | 69 ++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 189823c44d..fbbfd91ea5 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -243,7 +243,19 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp ::: :::: +**NPU** +::::{tab-set} +:::{tab-item} BAAI/bge-large-en-v1.5 +:sync: bge-large-en-v1.5-fp16 +```console +python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS 
--weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models +``` +::: +:::: + +> **Note** For NPU, change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision. For int4 precision, add the required extra parameter `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` +> **Note** For NPU, the pooling mode `--pooling LAST` has the best accuracy. +> **Note** For NPU with weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. @@ -280,47 +292,22 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- ## Tested models All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. 
The demo is validated against following Hugging Face models: -|Model name|Pooling| -|---|---| -|OpenVINO/Qwen3-Embedding-0.6B-int8-ov|LAST| -|OpenVINO/bge-base-en-v1.5-int8-ov|CLS| -|BAAI/bge-large-en-v1.5|CLS| -|BAAI/bge-large-zh-v1.5|CLS| -|thenlper/gte-small|CLS| -|sentence-transformers/all-MiniLM-L12-v2|MEAN| -|sentence-transformers/all-distilroberta-v1|MEAN| -|mixedbread-ai/deepset-mxbai-embed-de-large-v1|MEAN| -|intfloat/multilingual-e5-large-instruct|MEAN| -|intfloat/multilingual-e5-large|MEAN| -|Alibaba-NLP/gte-large-en-v1.5|CLS| -|nomic-ai/nomic-embed-text-v1.5|MEAN| -|sentence-transformers/all-mpnet-base-v2|MEAN| - - -**NPU** -::::{tab-set} -:::{tab-item} Qwen/Qwen3-Embedding-0.6B -:sync: Qwen3-Embedding-0.6B-fp16 -```console -python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models -``` -::: -:::: - -> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision. For int4 precisions, add required extra parameter `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` -> **Note** Pooling mode --pooling LAST has the best accuracy. 
-> **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` - -## Tested NPU models -The demo is validated against following Hugging Face models: - -|Model name|Pooling| -|---|---| -|Qwen/Qwen3-Embedding-0.6B|LAST| -|BAAI/bge-large-en-v1.5|CLS| -|BAAI/bge-large-zh-v1.5|CLS| -|thenlper/gte-small|CLS| -|sentence-transformers/all-mpnet-base-v2|MEAN| +|Model name|Pooling|Devices| +|---|---|---| +|OpenVINO/Qwen3-Embedding-0.6B-int8-ov|LAST|CPU,GPU| +|OpenVINO/bge-base-en-v1.5-int8-ov|CLS|CPU,GPU| +|Qwen/Qwen3-Embedding-0.6B|LAST|CPU,GPU,NPU| +|BAAI/bge-large-en-v1.5|CLS|CPU,GPU,NPU| +|BAAI/bge-large-zh-v1.5|CLS|CPU,GPU,NPU| +|thenlper/gte-small|CLS|CPU,GPU,NPU| +|sentence-transformers/all-MiniLM-L12-v2|MEAN|CPU,GPU| +|sentence-transformers/all-distilroberta-v1|MEAN|CPU,GPU| +|mixedbread-ai/deepset-mxbai-embed-de-large-v1|MEAN|CPU,GPU| +|intfloat/multilingual-e5-large-instruct|MEAN|CPU,GPU| +|intfloat/multilingual-e5-large|MEAN|CPU,GPU| +|Alibaba-NLP/gte-large-en-v1.5|CLS|CPU,GPU| +|nomic-ai/nomic-embed-text-v1.5|MEAN|CPU,GPU| +|sentence-transformers/all-mpnet-base-v2|MEAN|CPU,GPU,NPU| ## Server Deployment