From 72e003324a45729deb5cc819c1bc3ea7a51ca7be Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Tue, 20 Jan 2026 18:30:08 +0000 Subject: [PATCH 1/4] [OV 2026.0.0] Fix accuracy_metrics for tiny_llama_synthetic_data test --- tests/cross_fw/examples/example_scope.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json index 3e638e347eb..fb45cfbc9a9 100644 --- a/tests/cross_fw/examples/example_scope.json +++ b/tests/cross_fw/examples/example_scope.json @@ -242,7 +242,7 @@ "requirements": "examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt", "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz", "accuracy_metrics": { - "word_count": 84 + "word_count": 77 } }, "quantization_aware_training_torch_anomalib": { From 5511685795d31d1ba2cb574a94eacdc6593dfd34 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Mon, 9 Feb 2026 14:05:21 +0000 Subject: [PATCH 2/4] Disable KV cache quantization --- .../openvino/tiny_llama_synthetic_data/main.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py index 25faa00ee28..0ea2bfb9077 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py +++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py @@ -41,9 +41,22 @@ def transform_func(text, tokenizer): def main(): MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + ov_config = { + "PERFORMANCE_HINT": "LATENCY", + "NUM_STREAMS": "1", + "CACHE_DIR": "", + "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", + "KV_CACHE_PRECISION": "f16", + } tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - hf_model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, compile=False) + hf_model = OVModelForCausalLM.from_pretrained( + MODEL_ID, + export=True, + load_in_8bit=False, + compile=False, + ov_config=ov_config, + ) dataset_size = 100 From 0f4df53821201c50376b4cd05d16914170c11d5b Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Tue, 10 Feb 2026 09:20:34 +0000 Subject: [PATCH 3/4] Switch back to stateless model --- .../tiny_llama_synthetic_data/main.py | 31 +++++++++++++------ tests/cross_fw/examples/example_scope.json | 2 +- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py index 0ea2bfb9077..afa987aafaa 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py +++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py @@ -12,6 +12,7 @@ from functools import partial import numpy as np +import openvino as ov import torch from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer @@ -21,23 +22,32 @@ SEED = 0 -def transform_func(text, tokenizer): +def transform_func(text, tokenizer, ov_model): + input_dtypes = {inp.get_any_name(): inp.get_element_type() for inp in ov_model.inputs} tokens = tokenizer(text) input_ids = np.expand_dims(np.array(tokens["input_ids"]), 0) attention_mask = np.expand_dims(np.array(tokens["attention_mask"]), 0) - position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 - - batch_size = input_ids.shape[0] - - return { + res = { "input_ids": input_ids, "attention_mask": attention_mask, - "position_ids": position_ids, - "beam_idx": np.arange(batch_size, dtype=np.int64), + "position_ids": position_ids.reshape(*attention_mask.shape), } + def gen_pkv(num_heads, head_dim, num_layers): + res = {} + shape = (1, num_heads, 0, head_dim) + for i in range(num_layers): + key_name = f"past_key_values.{i}.key" + val_name = f"past_key_values.{i}.value" + res[key_name] = ov.Tensor(shape=shape, type=input_dtypes[key_name]) + res[val_name] = ov.Tensor(shape=shape, type=input_dtypes[val_name]) + return res + + res.update(gen_pkv(4, 64, 22)) + return res + def main(): MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" @@ -56,6 +66,7 @@ def main(): load_in_8bit=False, compile=False, ov_config=ov_config, + stateful=False, ) dataset_size = 100 @@ -64,7 +75,9 @@ def main(): saved_seed = torch.seed() torch.manual_seed(SEED) synthetic_dataset = nncf.data.generate_text_data(hf_model, tokenizer, dataset_size=dataset_size) - quantization_dataset = nncf.Dataset(synthetic_dataset, partial(transform_func, tokenizer=tokenizer)) + quantization_dataset = nncf.Dataset( + synthetic_dataset, partial(transform_func, tokenizer=tokenizer, ov_model=hf_model.model) + ) hf_model.request = None torch.manual_seed(saved_seed) diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json index fb45cfbc9a9..fbaf0adf123 100644 --- a/tests/cross_fw/examples/example_scope.json +++ b/tests/cross_fw/examples/example_scope.json @@ -242,7 +242,7 @@ "requirements": "examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt", "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz", "accuracy_metrics": { - "word_count": 77 + "word_count": 81 } }, "quantization_aware_training_torch_anomalib": { From 51bb60042bfd1e01f00fd57898d5d4171b0b31b9 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Wed, 18 Feb 2026 09:57:23 +0000 Subject: [PATCH 4/4] update ov config --- .../llm_compression/openvino/tiny_llama_synthetic_data/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py index afa987aafaa..c9e37560a69 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py +++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py @@ -55,8 +55,6 @@ def main(): "PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": "", - "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", - "KV_CACHE_PRECISION": "f16", } tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)