From f90fd93748b430e9aebc3fdf646a2690ecdec05d Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 9 Feb 2026 17:42:48 +0100 Subject: [PATCH 1/9] Npu docs --- demos/common/export_models/export_model.py | 4 +++ demos/embeddings/README.md | 37 +++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 97a9149b8b..687ae7a628 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -486,6 +486,10 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, destination_path = os.path.join(model_repository_path, model_name) print("Exporting embeddings model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: + if task_parameters['target_device'] == 'NPU': + if task_parameters['extra_quantization_params'] == "": + print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1") + task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1" optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path) print('Running command:', optimum_command) # for debug purposes if os.system(optimum_command): diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index b74042f754..72fbc6ee9a 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -243,7 +243,6 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp ::: :::: - > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. 
> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. @@ -297,6 +296,26 @@ All models supported by [optimum-intel](https://github.com/huggingface/optimum-i |sentence-transformers/all-mpnet-base-v2|MEAN| +**NPU** +::::{tab-set} +:::{tab-item} Qwen/Qwen3-Embedding-0.6B +:sync: Qwen3-Embedding-0.6B-fp16 +```console +python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --pooling LAST --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models +``` +::: +:::: + +> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is ised for better accuracy but `int8` and `int4` has better performance. +> **Note** Pooling mode --pooling LAST has the best accuracy. + +## Tested models +All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. The demo is validated against following Hugging Face models: + +|Model name|Pooling| +|---|---| +|Wqen/Qwen3-Embedding-0.6B|LAST| + ## Server Deployment :::{dropdown} **Deploying with Docker** @@ -313,6 +332,14 @@ to `docker run` command, use the image with GPU support and make sure set the ta ```bash docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json ``` +**NPU** + +In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` +to `docker run` command, use the image with NPU support and make sure set the target_device in subconfig.json to NPU. Also make sure the export model quantization level and cache size fit to the NPU memory. 
All of that can be applied with the commands: + +```bash +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json +``` ::: :::{dropdown} **Deploying on Bare Metal** @@ -368,9 +395,17 @@ curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "usage":{"prompt_tokens":4,"total_tokens":4} } + ``` ::: +## Client code for NPU + +:::{dropdown} **Request embeddings with cURL** +```bash +curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"Qwen/Qwen3-Embedding-0.6B\", \"input\": \"hello world\"}" +``` + :::{dropdown} **Request embeddings with OpenAI Python package** ```bash From 6fde3307f94ac8337683bc888e156fc370cfa878 Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 9 Feb 2026 17:55:04 +0100 Subject: [PATCH 2/9] Spell --- demos/embeddings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 72fbc6ee9a..55157de1f3 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -306,7 +306,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- ::: :::: -> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is ised for better accuracy but `int8` and `int4` has better performance. +> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is used for better accuracy but `int8` and `int4` has better performance. > **Note** Pooling mode --pooling LAST has the best accuracy. 
## Tested models From eebc30fec92a843f35dcf549c12fe6f5e69fba9a Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 10 Feb 2026 11:02:33 +0100 Subject: [PATCH 3/9] Remove reference that all models should work, they dont --- demos/embeddings/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 55157de1f3..c8f4179baa 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -310,11 +310,11 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- > **Note** Pooling mode --pooling LAST has the best accuracy. ## Tested models -All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. The demo is validated against following Hugging Face models: +The demo is validated against following Hugging Face models: |Model name|Pooling| |---|---| -|Wqen/Qwen3-Embedding-0.6B|LAST| +|Qwen/Qwen3-Embedding-0.6B|LAST| ## Server Deployment From 82faddcdac3bf8a57e61414a0d364797d5f61850 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 10 Feb 2026 11:10:38 +0100 Subject: [PATCH 4/9] Remove default quantization params since those are only required for int4 --- demos/common/export_models/export_model.py | 4 ---- demos/embeddings/README.md | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 687ae7a628..97a9149b8b 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -486,10 +486,6 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, destination_path = os.path.join(model_repository_path, model_name) print("Exporting embeddings model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: - if task_parameters['target_device'] == 'NPU': - if 
task_parameters['extra_quantization_params'] == "": - print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1") - task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1" optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path) print('Running command:', optimum_command) # for debug purposes if os.system(optimum_command): diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index c8f4179baa..f499bc2086 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -308,6 +308,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is used for better accuracy but `int8` and `int4` has better performance. > **Note** Pooling mode --pooling LAST has the best accuracy. 
+> **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` ## Tested models The demo is validated against following Hugging Face models: From 4bc4006b492cd3943063ff1d0227c1f1fce3c4b9 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Mon, 16 Feb 2026 13:18:45 +0100 Subject: [PATCH 5/9] Apply suggestions from code review Co-authored-by: Trawinski, Dariusz --- demos/embeddings/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index f499bc2086..835fd75925 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -306,7 +306,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- ::: :::: -> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. `fp16` is used for better accuracy but `int8` and `int4` has better performance. +> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision. For int4 precisions, add required extra parameter `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` > **Note** Pooling mode --pooling LAST has the best accuracy. > **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` @@ -334,7 +334,7 @@ to `docker run` command, use the image with GPU support and make sure set the ta docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json ``` **NPU** - +NOTE: NPU execution for embeddings model is a preview feature. 
In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command, use the image with NPU support and make sure set the target_device in subconfig.json to NPU. Also make sure the export model quantization level and cache size fit to the NPU memory. All of that can be applied with the commands: From aabf97660dff32365c5af2e3aaee5884fcfb2428 Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 16 Feb 2026 13:31:29 +0100 Subject: [PATCH 6/9] Code review --- demos/embeddings/README.md | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 835fd75925..8b54466e2f 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -310,12 +310,16 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- > **Note** Pooling mode --pooling LAST has the best accuracy. > **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` -## Tested models +## Tested NPU models The demo is validated against following Hugging Face models: |Model name|Pooling| |---|---| |Qwen/Qwen3-Embedding-0.6B|LAST| +|BAAI/bge-large-en-v1.5|CLS| +|BAAI/bge-large-zh-v1.5|CLS| +|thenlper/gte-small|CLS| +|sentence-transformers/all-mpnet-base-v2|MEAN| ## Server Deployment @@ -335,7 +339,7 @@ docker run -d --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /de ``` **NPU** NOTE: NPU execution for embeddings model is a preview feature. 
-In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` +In case you want to use NPU device to run the embeddings model, add extra docker parameters `--device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command, use the image with NPU support and make sure set the target_device in subconfig.json to NPU. Also make sure the export model quantization level and cache size fit to the NPU memory. All of that can be applied with the commands: ```bash @@ -400,15 +404,6 @@ curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d ``` ::: -## Client code for NPU - -:::{dropdown} **Request embeddings with cURL** -```bash -curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"Qwen/Qwen3-Embedding-0.6B\", \"input\": \"hello world\"}" -``` - -:::{dropdown} **Request embeddings with OpenAI Python package** - ```bash pip3 install openai "numpy<2" ``` From db1d4eef4a2d47da2fb08162ee6efb9155091232 Mon Sep 17 00:00:00 2001 From: rasapala Date: Mon, 16 Feb 2026 13:33:44 +0100 Subject: [PATCH 7/9] Fix --- demos/embeddings/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 8b54466e2f..27f714e6c7 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -243,6 +243,7 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp ::: :::: + > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. 
@@ -404,6 +405,8 @@ curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d ``` ::: +:::{dropdown} **Request embeddings with OpenAI Python package** + ```bash pip3 install openai "numpy<2" ``` From f63121ca0510769926b96f6627ad433a6426b0c1 Mon Sep 17 00:00:00 2001 From: rasapala Date: Wed, 18 Feb 2026 11:26:04 +0100 Subject: [PATCH 8/9] Fix model --- demos/embeddings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 27f714e6c7..189823c44d 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -302,7 +302,7 @@ All models supported by [optimum-intel](https://github.com/huggingface/optimum-i :::{tab-item} Qwen/Qwen3-Embedding-0.6B :sync: Qwen3-Embedding-0.6B-fp16 ```console -python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --pooling LAST --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models ``` ::: :::: From a2899c3793b45231ec88ee821187f4f51b1b4ca2 Mon Sep 17 00:00:00 2001 From: rasapala Date: Thu, 19 Feb 2026 09:38:58 +0100 Subject: [PATCH 9/9] Code review --- demos/embeddings/README.md | 69 ++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 189823c44d..fbbfd91ea5 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -243,7 +243,19 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp ::: :::: +**NPU** +::::{tab-set} +:::{tab-item} BAAI/bge-large-en-v1.5 +:sync: bge-large-en-v1.5-fp16 +```console +python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS 
--weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models +``` +::: +:::: + +> **Note** For NPU, change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision. For int4 precision, add the required extra parameter `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` +> **Note** For NPU, the pooling mode `--pooling LAST` has the best accuracy. +> **Note** For NPU with weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` > **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision to reduce memory consumption and improve performance. > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. @@ -280,47 +292,22 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- ## Tested models All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. 
The demo is validated against following Hugging Face models: -|Model name|Pooling| -|---|---| -|OpenVINO/Qwen3-Embedding-0.6B-int8-ov|LAST| -|OpenVINO/bge-base-en-v1.5-int8-ov|CLS| -|BAAI/bge-large-en-v1.5|CLS| -|BAAI/bge-large-zh-v1.5|CLS| -|thenlper/gte-small|CLS| -|sentence-transformers/all-MiniLM-L12-v2|MEAN| -|sentence-transformers/all-distilroberta-v1|MEAN| -|mixedbread-ai/deepset-mxbai-embed-de-large-v1|MEAN| -|intfloat/multilingual-e5-large-instruct|MEAN| -|intfloat/multilingual-e5-large|MEAN| -|Alibaba-NLP/gte-large-en-v1.5|CLS| -|nomic-ai/nomic-embed-text-v1.5|MEAN| -|sentence-transformers/all-mpnet-base-v2|MEAN| - - -**NPU** -::::{tab-set} -:::{tab-item} Qwen/Qwen3-Embedding-0.6B -:sync: Qwen3-Embedding-0.6B-fp16 -```console -python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format fp16 --target_device NPU --config_file_path models/config.json --model_repository_path models -``` -::: -:::: - -> **Note** Change the `--weight-format` to quantize the model to `fp16`, `int8` or `int4` precision. For int4 precisions, add required extra parameter `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` -> **Note** Pooling mode --pooling LAST has the best accuracy. 
-> **Note** For weight-format int4, use `--extra_quantization_params "--sym --ratio 1.0 --group-size -1"` - -## Tested NPU models -The demo is validated against following Hugging Face models: - -|Model name|Pooling| -|---|---| -|Qwen/Qwen3-Embedding-0.6B|LAST| -|BAAI/bge-large-en-v1.5|CLS| -|BAAI/bge-large-zh-v1.5|CLS| -|thenlper/gte-small|CLS| -|sentence-transformers/all-mpnet-base-v2|MEAN| +|Model name|Pooling|Devices| +|---|---|---| +|OpenVINO/Qwen3-Embedding-0.6B-int8-ov|LAST|CPU,GPU| +|OpenVINO/bge-base-en-v1.5-int8-ov|CLS|CPU,GPU| +|Qwen/Qwen3-Embedding-0.6B|LAST|CPU,GPU,NPU| +|BAAI/bge-large-en-v1.5|CLS|CPU,GPU,NPU| +|BAAI/bge-large-zh-v1.5|CLS|CPU,GPU,NPU| +|thenlper/gte-small|CLS|CPU,GPU,NPU| +|sentence-transformers/all-MiniLM-L12-v2|MEAN|CPU,GPU| +|sentence-transformers/all-distilroberta-v1|MEAN|CPU,GPU| +|mixedbread-ai/deepset-mxbai-embed-de-large-v1|MEAN|CPU,GPU| +|intfloat/multilingual-e5-large-instruct|MEAN|CPU,GPU| +|intfloat/multilingual-e5-large|MEAN|CPU,GPU| +|Alibaba-NLP/gte-large-en-v1.5|CLS|CPU,GPU| +|nomic-ai/nomic-embed-text-v1.5|MEAN|CPU,GPU| +|sentence-transformers/all-mpnet-base-v2|MEAN|CPU,GPU,NPU| ## Server Deployment