diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py
index 25faa00ee28..c9e37560a69 100644
--- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py
+++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py
@@ -12,6 +12,7 @@
 from functools import partial
 
 import numpy as np
+import openvino as ov
 import torch
 from optimum.intel.openvino import OVModelForCausalLM
 from transformers import AutoTokenizer
@@ -21,29 +22,50 @@
 SEED = 0
 
 
-def transform_func(text, tokenizer):
+def transform_func(text, tokenizer, ov_model):
+    input_dtypes = {inp.get_any_name(): inp.get_element_type() for inp in ov_model.inputs}
     tokens = tokenizer(text)
     input_ids = np.expand_dims(np.array(tokens["input_ids"]), 0)
     attention_mask = np.expand_dims(np.array(tokens["attention_mask"]), 0)
-
     position_ids = np.cumsum(attention_mask, axis=1) - 1
     position_ids[attention_mask == 0] = 1
-
-    batch_size = input_ids.shape[0]
-
-    return {
+    res = {
         "input_ids": input_ids,
         "attention_mask": attention_mask,
-        "position_ids": position_ids,
-        "beam_idx": np.arange(batch_size, dtype=np.int64),
+        "position_ids": position_ids.reshape(*attention_mask.shape),
     }
 
+    def gen_pkv(num_heads, head_dim, num_layers):
+        res = {}
+        shape = (1, num_heads, 0, head_dim)
+        for i in range(num_layers):
+            key_name = f"past_key_values.{i}.key"
+            val_name = f"past_key_values.{i}.value"
+            res[key_name] = ov.Tensor(shape=shape, type=input_dtypes[key_name])
+            res[val_name] = ov.Tensor(shape=shape, type=input_dtypes[val_name])
+        return res
+
+    res.update(gen_pkv(4, 64, 22))
+    return res
+
 
 def main():
     MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    ov_config = {
+        "PERFORMANCE_HINT": "LATENCY",
+        "NUM_STREAMS": "1",
+        "CACHE_DIR": "",
+    }
 
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    hf_model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, compile=False)
+    hf_model = OVModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        export=True,
+        load_in_8bit=False,
+        compile=False,
+        ov_config=ov_config,
+        stateful=False,
+    )
 
     dataset_size = 100
 
@@ -51,7 +73,9 @@ def main():
     saved_seed = torch.seed()
     torch.manual_seed(SEED)
     synthetic_dataset = nncf.data.generate_text_data(hf_model, tokenizer, dataset_size=dataset_size)
-    quantization_dataset = nncf.Dataset(synthetic_dataset, partial(transform_func, tokenizer=tokenizer))
+    quantization_dataset = nncf.Dataset(
+        synthetic_dataset, partial(transform_func, tokenizer=tokenizer, ov_model=hf_model.model)
+    )
     hf_model.request = None
     torch.manual_seed(saved_seed)
 
diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json
index 3e638e347eb..fbaf0adf123 100644
--- a/tests/cross_fw/examples/example_scope.json
+++ b/tests/cross_fw/examples/example_scope.json
@@ -242,7 +242,7 @@
         "requirements": "examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt",
         "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
         "accuracy_metrics": {
-            "word_count": 84
+            "word_count": 81
         }
     },
     "quantization_aware_training_torch_anomalib": {