From 488cacc2be70b7ae7e417c555d2aeea29163f5b6 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Mon, 10 Jun 2024 19:17:08 +0400 Subject: [PATCH 01/11] Support scale estimation inside GPTQ --- .../algorithms/layerwise/scheduler.py | 34 +- .../weight_compression/activation_stats.py | 7 +- .../weight_compression/algorithm.py | 59 ++-- .../algorithms/weight_compression/gptq.py | 41 ++- .../weight_compression/scale_estimation.py | 316 ++++++++++-------- nncf/quantization/quantize_model.py | 5 - .../openvino/native/quantization/test_gptq.py | 5 +- .../quantization/test_weights_compression.py | 5 +- 8 files changed, 271 insertions(+), 201 deletions(-) diff --git a/nncf/quantization/algorithms/layerwise/scheduler.py b/nncf/quantization/algorithms/layerwise/scheduler.py index 8eee99fad28..8abc03400c0 100644 --- a/nncf/quantization/algorithms/layerwise/scheduler.py +++ b/nncf/quantization/algorithms/layerwise/scheduler.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass from dataclasses import field @@ -177,26 +178,31 @@ def schedule( old_input_nodes = set() new_input_nodes = set() for p in paths: - target_output_nodes = set() + target_outputs = [] additional_output_nodes = set() for output_node in p.output_nodes: - if output_node in target_nodes: - target_output_nodes.add(output_node) - elif output_node in p.input_nodes: - reuse_input_nodes.add(output_node) - else: - # filter additional output nodes - for prev_node in inference_graph.get_previous_nodes(output_node): - if prev_node not in p.output_nodes: - additional_output_nodes.add(output_node) - break - if not target_output_nodes: + try: + target_node_index = target_nodes.index(output_node) + target_outputs.append((target_node_index, output_node)) + except ValueError: + if output_node in p.input_nodes: + reuse_input_nodes.add(output_node) + else: + # filter additional output nodes + for prev_node in inference_graph.get_previous_nodes(output_node): + if prev_node not in p.output_nodes: + additional_output_nodes.add(output_node) + break + if not target_outputs: continue + target_outputs.sort(key=lambda target_output: target_output[0]) + target_output_nodes = [output[1] for output in target_outputs] + old_input_nodes |= p.input_nodes - new_input_nodes |= target_output_nodes | additional_output_nodes + new_input_nodes |= set(target_output_nodes) | additional_output_nodes subgraph_inputs = list(p.inputs) - step_target_nodes = {} + step_target_nodes = OrderedDict() subgraph_outputs = [] for node in target_output_nodes: target_edge = {} diff --git a/nncf/quantization/algorithms/weight_compression/activation_stats.py b/nncf/quantization/algorithms/weight_compression/activation_stats.py index eb8286e6383..359887e7769 100644 --- a/nncf/quantization/algorithms/weight_compression/activation_stats.py +++ b/nncf/quantization/algorithms/weight_compression/activation_stats.py @@ -9,14 +9,13 @@ # 
See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, TypeVar +from typing import List, Tuple +from nncf.tensor import Tensor from nncf.tensor import functions as fns -TTensor = TypeVar("TTensor") - -def process_stats(stats: List[TTensor], subset_size: int) -> Tuple[TTensor, TTensor]: +def process_stats(stats: List[Tensor], subset_size: int) -> Tuple[Tensor, Tensor]: """ It's a processing of activations shared between AWQ, Scale Estimation and LoRA Correction algorithms. diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 3499521bce3..1b2af0fd9a3 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -124,7 +124,12 @@ def __init__( if self._gptq: gptq_params = self._advanced_parameters.gptq_params - self._gptq_algo = GPTQ(gptq_params.damp_percent, gptq_params.block_size, gptq_params.subset_size) + self._gptq_algo = GPTQ( + damp_percent=gptq_params.damp_percent, + block_size=gptq_params.block_size, + subset_size=gptq_params.subset_size, + scale_estimation=self._scale_estimation, + ) self._gptq_statistics = None @property @@ -379,25 +384,8 @@ def apply( scales = {} zero_points = {} - if ( - self._scale_estimation - and activations is not None - and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] - ): - scale_estimation_params = self._advanced_parameters.scale_estimation_params - scale_algo = ScaleEstimation( - model, - self._backend_entity.name_to_node_mapping, - all_weight_params, - nodes_to_compress, - activations, - scale_estimation_params.subset_size, - scale_estimation_params.initial_steps, - scale_estimation_params.scale_steps, - scale_estimation_params.weight_penalty, - ) - scales = scale_algo.apply(model, graph) - + lora_correction_algo = None + description = "Applying Weight 
Compression" if self._gptq: model, scales, zero_points = self._gptq_algo.apply( model=model, @@ -407,13 +395,30 @@ def apply( statistic_points=self._gptq_statistics, backend_entity=self._backend_entity, ) + else: + if ( + self._scale_estimation + and activations is not None + and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + ): + scale_estimation_params = self._advanced_parameters.scale_estimation_params + scale_algo = ScaleEstimation( + model, + self._backend_entity.name_to_node_mapping, + all_weight_params, + nodes_to_compress, + activations, + scale_estimation_params.subset_size, + scale_estimation_params.initial_steps, + scale_estimation_params.scale_steps, + scale_estimation_params.weight_penalty, + ) + scales = scale_algo.apply(model, graph) - lora_correction_algo = None - description = "Applying Weight Compression" - if self._lora_correction: - lora_correction_params = self._advanced_parameters.lora_correction_params - lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) - description += " with correction of low-rank adapters" + if self._lora_correction: + lora_correction_params = self._advanced_parameters.lora_correction_params + lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) + description += " with correction of low-rank adapters" # Sort weight params to start compression with the bigger constants. This lowers peak memory footprint. 
all_weight_params = sorted(all_weight_params, key=lambda wp: wp.num_weights, reverse=True) @@ -542,7 +547,7 @@ def _get_activations( statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset) statistics_aggregator.register_statistic_points(statistic_container) - if self._gptq: + if self._gptq and not self._awq: self._gptq_statistics = self._gptq_algo.get_statistic_points( model, graph, nodes_to_compress, self._backend_entity ) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b595e080533..b1101916da3 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -25,6 +25,7 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight @@ -44,10 +45,7 @@ class GPTQ: """ def __init__( - self, - damp_percent: float = 0.1, - block_size: int = 128, - subset_size: int = 128, + self, damp_percent: float = 0.1, block_size: int = 128, subset_size: int = 128, scale_estimation: bool = False ): """ :param damp_percent: The percent of the average Hessian diagonal to use for dampening, @@ -58,6 +56,7 @@ def __init__( self._damp_percent = damp_percent self._block_size = block_size self._subset_size = subset_size + self._scale_estimation = scale_estimation self._backend = None 
self._backend_entity = None @@ -124,10 +123,9 @@ def apply( CompressWeightsMode.INT8_SYM, ]: continue - assert len(inputs) == 1 _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) - scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian) + scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) scales[wc_params.weight_name] = scale zero_points[wc_params.weight_name] = zero_point @@ -193,7 +191,12 @@ def _calculate_hessian(self, node: NNCFNode, inputs: List[Tensor]) -> Tensor: return hessian def _quantize_weights( - self, model: TModel, graph: NNCFGraph, wc_params: WeightCompressionParameters, hessian: Tensor + self, + model: TModel, + graph: NNCFGraph, + wc_params: WeightCompressionParameters, + hessian: Tensor, + inputs: List[Tensor], ): """ Quantizes the weights of the model based on the calculated Hessian matrix. @@ -260,11 +263,25 @@ def _quantize_weights( scale = calculate_nf4_scale(weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes) scales.append(scale) else: - scale, zero_point = calculate_integer_quantization_params( - weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config - ) - scales.append(scale) - zero_points.append(zero_point) + if self._scale_estimation and block_compression_config.num_bits == 4: + activations = [inp.squeeze()[:, (i1 + i) : (i1 + i + group_size)] for inp in inputs] + scale, zero_point = ScaleEstimation.calculate_quantization_params( + self._backend_entity, + activations, + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + wc_params.compression_config, + ) + scales.append(scale.squeeze(axis=1)) + zero_points.append(zero_point) + else: + scale, zero_point = calculate_integer_quantization_params( + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + block_compression_config, + ) + scales.append(scale) + zero_points.append(zero_point) if 
block_compression_config.mode == CompressWeightsMode.NF4: compressed_weights = do_nf4_quantization( fns.unsqueeze(weight_col, 1), scales[-1], is_normalized_weight=False diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 6d1110c108f..712c5fd955d 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,16 +20,17 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns TModel = TypeVar("TModel") -TTensor = TypeVar("TTensor") -TWeightType = TypeVar("TWeightType") class ScaleEstimation: @@ -37,13 +38,15 @@ class ScaleEstimation: Scale estimation algorithm implementation. 
""" + compress_decompress_cache = {} + def __init__( self, model: TModel, name_to_node_mapping: Dict[str, Any], all_weight_params: List[WeightCompressionParameters], nodes_to_compress: List[NNCFNode], - activations: Optional[Dict[str, TTensor]] = None, + activations: Optional[Dict[str, List[Tensor]]] = None, subset_size: int = 32, initial_steps: int = 5, scale_steps: int = 10, @@ -103,7 +106,7 @@ def apply( graph: NNCFGraph, statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, - ) -> Dict[str, TTensor]: + ) -> Dict[str, Tensor]: """ Estimates better scale for the int4 nodes in the model. Minimizes per-group difference between floating point MatMul and @@ -118,8 +121,7 @@ def apply( :return: Dict with pairs (weight name, estimated scale). """ - compress_decompress_cache = {} - res = dict() + scales = dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name @@ -127,11 +129,10 @@ def apply( config = wp.compression_config if config.num_bits != 4 or node_name not in self._activations: - res[weight_name] = None + scales[weight_name] = None continue - s, X = process_stats(self._activations[node_name], self._subset_size) - reduction_axis = wp.reduction_axes[0] + stats = self._activations[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm @@ -139,162 +140,211 @@ def apply( _, weight_port_id = weight_data[0] weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - weight = weight.astype(TensorDataType.float32) - eps = fns.finfo(weight).eps - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 + scales[weight_name], _ = self.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + 
self._weight_penalty, + ) - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size + return scales - original_weight = fns.zeros_like(weight) + weight + @staticmethod + def calculate_quantization_params( + backend_entity: WeightCompressionAlgoBackend, + activations: List[Tensor], + weight: Tensor, + reduction_axes: Tuple[int, ...], + config: WeightCompressionConfig, + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ) -> Tensor: + """ + Calculates the quantization parameters for a given set of weights and activations. + This function estimates the optimal quantization scale for weight compression by + minimizing the difference between floating-point operations and operations with + quantized weights. + + The function uses an iterative process: + 1. Initial scale rectification based on activation statistics. + 2. A grid search to further refine the scale parameters. + + :param backend_entity: The backend-specific implementation of the weight compression algorithm. + :param activations: List of activation tensors corresponding to the layers being quantized. + :param weight: The weight tensor that is being quantized. + :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. + :param config: Configuration parameters for the weight compression, including quantization settings. + :param subset_size: The number of samples to use for scale estimation. Defaults to 32. + :param initial_steps: The number of steps for initial scale rectification using activation statistics. + Defaults to 5. + :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. + :param weight_penalty: Penalty coefficient applied to the difference between floating-point + and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. 
+ :return: A tensor containing the calculated quantization scales and zero points if applicable. + """ + reduction_axis = reduction_axes[0] - compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) - if zp is not None: - zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) + s, X = process_stats(activations, subset_size) - s = fns.unsqueeze(s, 0) - s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) + weight = weight.astype(TensorDataType.float32) + eps = fns.finfo(weight).eps - original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 - # all weight in group has importance based on corresponding input activations - importance = fns.ones_like(original_weight) - importance = importance * s + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - importance = fns.where(zero_mask, 0.0, importance) - - # normalize importances for every group of weights to make sum of them equal to 1.0 - denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) - - X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) - best_diffs = None - result_scale = None - - fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) - q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - - # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE - min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) - if 
self._weight_penalty > 0.0: - min_max_scale_diffs += self._weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - - zp_shape = zp.shape if zp is not None else None - key = [(wp.compression_config.mode, wp.compression_config.num_bits) + q_weights.shape + scale.shape] - if zp is not None: - key += zp_shape - key = tuple(key) - if key in compress_decompress_cache: - compress_decompress_model = compress_decompress_cache[key]["compress_decompress_model"] - compress_model = compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = self._backend_entity.get_compress_decompress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = self._backend_entity.get_compress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } - - scale_sign = scale / fns.abs(scale) - zero_scale = 0.001 - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + original_weight = fns.zeros_like(weight) + weight - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) - # iterative rectification of initial scale - for i in range(self._initial_steps): - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data + compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) + if zp is not None: + zp = zp.astype(scale.dtype) + q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + s = fns.unsqueeze(s, 0) + s, _ = reshape_weight_for_grouped_quantization(s, 
reduction_axis, group_size) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) - if best_diffs is None: - best_diffs = min_max_scale_diffs + # all weight in group has importance based on corresponding input activations + importance = fns.ones_like(original_weight) + importance = importance * s - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + importance = fns.where(zero_mask, 0.0, importance) - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + # normalize importances for every group of weights to make sum of them equal to 1.0 + denum = fns.sum(importance, axis=2, keepdims=True) + importance = importance / (denum + eps) - mask = fns.unsqueeze(mask, axis=2) + X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) + best_diffs = None + result_scale = None - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data + fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) + q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - if i < self._initial_steps - 1: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + # metric for minimization with 
shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE + min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - # iterative rectification of scale based on grid search - for scale_steps in range(self._scale_steps): - factor = 1.0 - 0.05 * scale_steps - scaled_scale = factor * scale + zp_shape = zp.shape if zp is not None else None + key = (config.mode, config.num_bits) + q_weights.shape + scale.shape + if zp is not None: + key += zp_shape + if key in ScaleEstimation.compress_decompress_cache: + compress_decompress_model = ScaleEstimation.compress_decompress_cache[key]["compress_decompress_model"] + compress_model = ScaleEstimation.compress_decompress_cache[key]["compress_model"] + else: + compress_decompress_model = backend_entity.get_compress_decompress_pipeline( + config, q_weights.shape, scale.shape, zp_shape + ) + compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) + ScaleEstimation.compress_decompress_cache[key] = { + "compress_decompress_model": compress_decompress_model, + "compress_model": compress_model, + } + scale_sign = scale / fns.abs(scale) + zero_scale = 0.001 + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + input_tensors = [original_weight.data, None] + if zp is not None: + input_tensors.append(zp.data) + # iterative rectification of initial scale + for i in range(initial_steps): + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + input_tensors[1] = near_to_ideal_scale.data + + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + + ideal_scale_diffs = fns.mean((fp_outs - q_outs) 
** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + if best_diffs is None: + best_diffs = min_max_scale_diffs + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + input_tensors[1] = near_to_ideal_scale.data - input_tensors[1] = scaled_scale.data + if i < initial_steps - 1: out = compress_model(input_tensors) compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out + # iterative rectification of scale based on grid search + for scale_steps in range(scale_steps): + factor = 1.0 - 0.05 * scale_steps + scaled_scale = factor * scale + + input_tensors[1] = scaled_scale.data + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + target, zero_mask = get_target_zero_mask(compressed_weights, 
zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + + input_tensors[1] = near_to_ideal_scale.data + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - mask = fns.unsqueeze(mask, axis=2) + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale - if config.group_size == -1: - result_scale = fns.squeeze(result_scale, axis=1) - res[weight_name] = result_scale + if config.group_size == -1: + result_scale = fns.squeeze(result_scale, axis=1) - return res + return result_scale, zp -def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = None) -> Tuple[TTensor, TTensor]: +def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: """ Computes the target values and a mask indicating zero 
values in the target. @@ -310,7 +360,7 @@ def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = No return target, zero_mask -def estimate_scales(weight: TTensor, target: TTensor, zero_mask: TTensor, importance: TTensor) -> TTensor: +def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: """ Estimates scales for the given weight, target, zero mask, and importance. diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index e96c4526c51..60baeacc48e 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -482,11 +482,6 @@ def compress_weights( if any((gptq, lora_correction)) and (dataset is None or mode == CompressWeightsMode.E2M1): raise AttributeError("GPTQ or Lora Correction algorithm is defined, but dataset is None or mode is E2M1.") - if gptq and scale_estimation: - raise AttributeError( - "Simultaneous use of Scale estimation and GPTQ algorithms is not supported. Select one of them." - ) - if gptq and lora_correction: raise AttributeError( "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them." 
diff --git a/tests/openvino/native/quantization/test_gptq.py b/tests/openvino/native/quantization/test_gptq.py index 1202b216ec7..ad19990eac0 100644 --- a/tests/openvino/native/quantization/test_gptq.py +++ b/tests/openvino/native/quantization/test_gptq.py @@ -341,7 +341,8 @@ def test_calculate_scale_linear(): gptq._set_backend_entity(ov_model) nodes = graph.get_all_nodes() - H = gptq._calculate_hessian(nodes[1], [Tensor(inp) for inp in inputs]) + wrapped_inputs = [Tensor(inp) for inp in inputs] + H = gptq._calculate_hessian(nodes[1], wrapped_inputs) ref_H = ref_gptq.H.numpy() assert np.all(np.isclose(ref_H, H.data)) @@ -351,7 +352,7 @@ def test_calculate_scale_linear(): ) wc_params.compression_config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_SYM, group_size=16) - scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H) + scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H, wrapped_inputs) ref_scale = ref_scale.numpy() scale = scale.reshape(ref_scale.shape) assert np.all(np.isclose(ref_scale, scale.data)) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index bb9b5c373c7..c51cf667ca2 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -713,10 +713,7 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params): @pytest.mark.parametrize("mode", INT4_MODES) @pytest.mark.parametrize( "params", - ( - {"dataset": "anything", "scale_estimation": True, "gptq": True}, - {"dataset": "anything", "lora_correction": True, "gptq": True}, - ), + ({"dataset": "anything", "lora_correction": True, "gptq": True},), ) def test_raise_error_with_unsupported_params_for_int4(mode, params): with pytest.raises(AttributeError): From ee648777dcb951f4c7bdadd3997680a5083645a7 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Wed, 4 Sep 2024 13:25:22 +0400 
Subject: [PATCH 02/11] fix for INT4_ASYM --- nncf/quantization/algorithms/weight_compression/gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b1101916da3..bd6518c86ad 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -273,7 +273,7 @@ def _quantize_weights( wc_params.compression_config, ) scales.append(scale.squeeze(axis=1)) - zero_points.append(zero_point) + zero_points.append(zero_point if zero_point is None else zero_point.squeeze(axis=1)) else: scale, zero_point = calculate_integer_quantization_params( weight_tensor[:, (i1 + i) : (i1 + i + group_size)], From 65aed7cc5f6350c7c45113a953c47c051af83ebd Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 15 May 2026 16:35:45 +0200 Subject: [PATCH 03/11] Fixed asym compression for the case when all values are positive or negative. 
--- src/nncf/openvino/optimized_functions/models.py | 4 ++++ .../algorithms/weight_compression/weight_lowering.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index 55a9373fa6b..11396816f5b 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -531,6 +531,10 @@ def _build_integer_quantization_model( max_values = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) + zero = opset.constant(0.0, ov.Type.f32) + min_values = opset.minimum(zero, min_values) + max_values = opset.maximum(zero, max_values) + if is_asym_mode: levels = level_high - level_low + 1 scale = divide_op(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 7f1bdf3dfae..86ba2153ee8 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -317,6 +317,11 @@ def calculate_integer_quantization_params( level_high = 2**num_bits - 1 min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] max_values = fns.max(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] + + zero = fns.zeros_like(min_values) + min_values = fns.minimum(zero, min_values) + max_values = fns.maximum(zero, max_values) + scale, zero_point = calculate_scale_zero_point( min_values, max_values, level_low, level_high, narrow_range=False ) From 6364003341fb5656284040537c7f9c65158fba0b Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 15 May 2026 16:52:39 +0200 Subject: [PATCH 04/11] Fixed OV optimization. 
--- src/nncf/openvino/optimized_functions/models.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index 11396816f5b..de5c36970c6 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -531,11 +531,10 @@ def _build_integer_quantization_model( max_values = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) - zero = opset.constant(0.0, ov.Type.f32) - min_values = opset.minimum(zero, min_values) - max_values = opset.maximum(zero, max_values) - if is_asym_mode: + zero = opset.constant(0.0, ov.Type.f32) + min_values = opset.minimum(zero, min_values) + max_values = opset.maximum(zero, max_values) levels = level_high - level_low + 1 scale = divide_op(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale) From 5d39420c6f12a075d8a1611c6a9d57c274a15cb4 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 21 May 2026 16:03:47 +0200 Subject: [PATCH 05/11] Updated OV test references. 
--- .../template_test_weights_compression.py | 10 ++--- .../quantization/test_weights_compression.py | 38 ++++++++++--------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 6f55e6a2c5f..257481edb05 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -209,20 +209,20 @@ def wrap_model(model, data) -> CompressionParams: ("mode", "all_layers", "ratio", "ref_ids"), ( (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 1, 4]), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 2]), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2]), (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 4]), (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 4]), (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 
4]), (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]), ), ) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index d8c6932d72d..cb7adea7c60 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -739,11 +739,18 @@ def __str__(self): weight=TWO_GROUPS_IN_TWO_ROWS_ASYM, config=int4_asym_grouped_config, ), - # non-zero error - QuantErrorDesc(name="2 rows scaled [1, 254] linspace", weight=TWO_ROWS_LINSPACE[:, 1:-1], ref_error=239, atol=1), + QuantErrorDesc(name="2 rows scaled [1, 254] linspace", weight=TWO_ROWS_LINSPACE[:, 1:-1], ref_error=0, atol=1), + QuantErrorDesc( + name="2 columns of scaled [0, 255] linspace", weight=np.transpose(TWO_ROWS_LINSPACE), ref_error=0, atol=1 + ), QuantErrorDesc( - name="2 columns of scaled [0, 255] linspace", weight=np.transpose(TWO_ROWS_LINSPACE), ref_error=46818, atol=1 + name="2 columns of [0-15] linspace for asym", + weight=np.transpose(TWO_ROWS_LINSPACE_INT4_ASYM), + config=int4_asym_config, + ref_error=0, + atol=1, ), + # non-zero error QuantErrorDesc( name="2 rows of scaled [0, 15] linspace for sym", weight=TWO_ROWS_LINSPACE_INT4_ASYM, @@ -765,13 +772,6 @@ def __str__(self): ref_error=1.49, atol=1, ), - QuantErrorDesc( - name="2 columns of [0-15] linspace for asym", - weight=np.transpose(TWO_ROWS_LINSPACE_INT4_ASYM), - config=int4_asym_config, - ref_error=162, - atol=1, - ), ] @@ -1286,12 +1286,12 @@ def test_call_gptq_with_dataset_scale_estimation_neg_group_size(mode): ("sensitivity_metric", "all_layers", "ratio", "ref_ids", "group_size"), ( (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 1, 2], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [1], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 
3, 4], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0], None), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, [], None), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 2], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [1], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0], None), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, [], None), (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2], None), (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2], None), @@ -1338,6 +1338,7 @@ def test_mixed_precision_mxfp(sensitivity_metric, all_layers, ratio, ref_ids, mo names_fp = {op.get_friendly_name() for op in ops} ref_fp_nodes = {f"weights_{i}" for i in ref_ids} + assert ref_fp_nodes == names_fp names_e8m0 = { @@ -1351,12 +1352,12 @@ def test_mixed_precision_mxfp(sensitivity_metric, all_layers, ratio, ref_ids, mo ("sensitivity_metric", "all_layers", "ratio", "ref_ids", "group_size"), ( (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 1, 2], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 1, 4], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [1], None), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, [], None), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 2], None), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [1], None), 
(SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, [], None), (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2], None), (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2], None), @@ -1405,6 +1406,7 @@ def test_mixed_precision_fp(sensitivity_metric, all_layers, ratio, ref_ids, mode names_fp = {op.get_friendly_name() for op in ops} ref_fp_nodes = {f"weights_{i}" for i in ref_ids} + assert ref_fp_nodes == names_fp scale_dtypes = (ov.Type.f16, ov.Type.f8e4m3) names_scales = { From 2aa48d5f118967b9be6a582a076fb6bcef8366d5 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 21 May 2026 16:08:40 +0200 Subject: [PATCH 06/11] Updated OV test references. --- .../IntegerModel_compressed_weights_int4_asym.json | 10 +++++----- .../IntegerModel_compressed_weights_int8_asym.json | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json index b4528b858d5..fd00b6a4b09 100644 --- a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json +++ b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json @@ -3,7 +3,7 @@ "compressed_weight": [ [ [ - 7.0, + 8.0, 0.0, 13.0 ] @@ -17,8 +17,8 @@ ], [ [ - 10.0, - 1.0, + 12.0, + 4.0, 0.0 ] ], @@ -79,7 +79,7 @@ "scale": [ [ [ - 0.040008544921875 + 0.046630859375 ] ], [ @@ -89,7 +89,7 @@ ], [ [ - 0.041839599609375 + 0.0545654296875 ] ], [ diff --git a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_asym.json b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_asym.json index a1cb92a00d5..1729fac42cf 100644 --- a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_asym.json +++ 
b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_asym.json @@ -2,9 +2,9 @@ "matmul_2_data": { "compressed_weight": [ [ - 116, + 136, 0, - 213 + 219 ], [ 255, @@ -12,8 +12,8 @@ 0 ], [ - 177, - 10, + 196, + 67, 0 ], [ @@ -54,13 +54,13 @@ ], "scale": [ [ - 0.002353668212890625 + 0.002742767333984375 ], [ 0.00583648681640625 ], [ - 0.002460479736328125 + 0.0032100677490234375 ], [ 0.0029277801513671875 From 20040964f1943045ef0217e529b1ed097e9ed682 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 21 May 2026 17:34:52 +0200 Subject: [PATCH 07/11] Updated references for OV test_scale_estimation --- .../template_test_weights_compression.py | 1 + .../quantization/test_weights_compression.py | 182 +++++++++--------- 2 files changed, 92 insertions(+), 91 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 257481edb05..b99ef81d13b 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -323,6 +323,7 @@ def test_scale_estimation(self, mocker, transpose_a, is_moe, check_sampling_acti reference = self.get_moe_scale_estimation_ref(check_sampling_activation_stats_flow) else: reference = self.get_scale_estimation_ref(check_sampling_activation_stats_flow) + assert fns.allclose(Tensor(reference), computed_scale) @staticmethod diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index cb7adea7c60..aa9ea746ea3 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -2471,42 +2471,42 @@ def get_scale_estimation_ref(check_sampling_activation_stats_flow): return ( np.array( [ - [[0.473328]], - [[0.929023]], - [[1.446527]], - [[1.920595]], - 
[[2.517054]], - [[3.030102]], - [[3.584279]], - [[4.043509]], - [[4.620008]], - [[5.165322]], - [[5.710637]], - [[6.122581]], - [[6.655914]], - [[7.237174]], - [[7.722580]], + [[0.47332805]], + [[1.0]], + [[1.4732642]], + [[2.0380495]], + [[2.6054149]], + [[3.0301015]], + [[3.679056]], + [[4.175322]], + [[4.700384]], + [[5.2552223]], + [[5.8100615]], + [[6.3083715]], + [[6.858295]], + [[7.4082184]], + [[7.722581]], [[8.255914]], ] ), np.array( [ [[0.47344488]], - [[0.9287766]], - [[1.4463282]], - [[1.920052]], - [[2.5167778]], - [[3.02987]], - [[3.5842714]], - [[4.0429296]], - [[4.619769]], - [[5.165224]], - [[5.7106786]], - [[6.121212]], - [[6.654546]], - [[7.2366524]], - [[7.7212124]], - [[8.254545]], + [[1. ]], + [[1.5450557 ]], + [[2.0380037 ]], + [[2.6055446 ]], + [[3.02987 ]], + [[3.679132 ]], + [[4.1754694 ]], + [[4.7001443 ]], + [[5.2551227 ]], + [[5.810101 ]], + [[6.308658 ]], + [[6.8587303 ]], + [[7.4 ]], + [[7.7212124 ]], + [[8.254545 ]], ] ), )[check_sampling_activation_stats_flow] @@ -2519,44 +2519,44 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ [ - 7.5732, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.2602, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.3083, - 7.8467, - 7.2233, - 7.2715, - 7.4205, - 7.4667, + 7.573249, + 7.58195, + 7.6, + 7.6666665, + 7.1209445, + 7.260152, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.528544, + 8.659291, + 8.879055, + 8.469787, + 8.4, + 8.364824, ] ] ], [ [ [ - 14.8205, - 14.9032, - 14.9858, - 15.0685, - 15.1512, - 14.3400, - 14.4173, - 14.4945, - 14.5718, - 14.6491, - 14.7264, - 14.8037, - 14.8810, - 14.9583, - 15.0355, - 15.1128, + 16.0, + 16.089771, + 16.179543, + 16.269318, + 16.359089, + 16.44886, + 16.538631, + 16.628407, + 16.718176, + 16.80795, + 16.89772, + 16.987492, + 15.812495, + 15.89516, + 15.977826, + 16.060493, ] ] ], @@ -2568,43 +2568,43 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ 7.575118, - 7.4666667, - 7.4666667, - 7.4666667, - 
7.4666667, + 7.5841107, + 7.6, + 7.6666665, + 7.112954, 7.254837, - 7.4666667, - 7.4666667, - 7.4666667, - 7.4666667, - 7.495066, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.531546, 7.850108, - 7.219489, - 7.2685375, - 7.418597, - 7.4666667, + 8.887045, + 8.468656, + 8.4, + 8.361673, ] ] ], [ [ [ - 14.820066, - 14.902746, - 14.985427, - 15.068108, - 15.150787, - 14.3391285, - 14.416424, - 14.493721, - 14.571016, - 14.648311, - 14.725608, - 14.802904, - 14.8801985, - 14.957496, - 15.034791, - 15.112087, + 16.0, + 16.089788, + 16.17958, + 16.269371, + 16.359161, + 16.448954, + 16.538742, + 16.628534, + 16.718325, + 16.808115, + 16.897905, + 16.987696, + 15.812232, + 15.894914, + 15.977593, + 16.060274, ] ] ], From 275b9dcc7028bdc7b9b9fa32c561145e4da81375 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 21 May 2026 21:22:16 +0200 Subject: [PATCH 08/11] Updated refernces for torch scale estimation test. --- .../quantization/test_weights_compression.py | 190 +++++++++--------- 1 file changed, 95 insertions(+), 95 deletions(-) diff --git a/tests/torch/function_hook/quantization/test_weights_compression.py b/tests/torch/function_hook/quantization/test_weights_compression.py index a70e0f83879..c7446a3a27d 100644 --- a/tests/torch/function_hook/quantization/test_weights_compression.py +++ b/tests/torch/function_hook/quantization/test_weights_compression.py @@ -659,42 +659,42 @@ def get_scale_estimation_ref(check_sampling_activation_stats_flow): return ( torch.tensor( [ - [[0.473328]], - [[0.929023]], - [[1.446527]], - [[1.920595]], - [[2.517054]], - [[3.030102]], - [[3.584279]], - [[4.043509]], - [[4.620008]], - [[5.165322]], - [[5.710637]], - [[6.122581]], - [[6.655914]], - [[7.237174]], - [[7.722580]], + [[0.47332805]], + [[1.0]], + [[1.4732642]], + [[2.0380495]], + [[2.6054149]], + [[3.0301015]], + [[3.679056]], + [[4.175322]], + [[4.700384]], + [[5.2552223]], + [[5.8100615]], + [[6.3083715]], + [[6.858295]], + [[7.4082184]], + [[7.722581]], [[8.255914]], ] 
), torch.tensor( [ - [[0.473445]], - [[0.928777]], - [[1.446328]], - [[1.920052]], - [[2.516778]], - [[3.029870]], - [[3.584271]], - [[4.042929]], - [[4.619769]], - [[5.165224]], - [[5.710679]], - [[6.121212]], - [[6.654546]], - [[7.236652]], - [[7.721212]], - [[8.254545]], + [[0.47344488]], + [[1. ]], + [[1.5450557 ]], + [[2.0380037 ]], + [[2.6055446 ]], + [[3.02987 ]], + [[3.679132 ]], + [[4.1754694 ]], + [[4.7001443 ]], + [[5.2551227 ]], + [[5.810101 ]], + [[6.308658 ]], + [[6.8587303 ]], + [[7.4 ]], + [[7.7212124 ]], + [[8.254545 ]], ] ), )[check_sampling_activation_stats_flow] @@ -707,44 +707,44 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ [ - 7.5732, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.2602, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.3083, - 7.8467, - 7.2233, - 7.2715, - 7.4205, - 7.4667, + 7.573249, + 7.58195, + 7.6, + 7.6666665, + 7.1209445, + 7.260152, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.528544, + 8.659291, + 8.879055, + 8.469787, + 8.4, + 8.364824, ] ] ], [ [ [ - 14.8205, - 14.9032, - 14.9858, - 15.0685, - 15.1512, - 14.3400, - 14.4173, - 14.4945, - 14.5718, - 14.6491, - 14.7264, - 14.8037, - 14.8810, - 14.9583, - 15.0355, - 15.1128, + 16.0, + 16.089771, + 16.179543, + 16.269318, + 16.359089, + 16.44886, + 16.538631, + 16.628407, + 16.718176, + 16.80795, + 16.89772, + 16.987492, + 15.812495, + 15.89516, + 15.977826, + 16.060493, ] ] ], @@ -755,44 +755,44 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ [ - 7.5751, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.2548, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.4951, - 7.8501, - 7.2195, - 7.2685, - 7.4186, - 7.4667, + 7.575118, + 7.5841107, + 7.6, + 7.6666665, + 7.112954, + 7.254837, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.531546, + 7.850108, + 8.887045, + 8.468656, + 8.4, + 8.361673, ] ] ], [ [ [ - 14.8201, - 14.9027, - 14.9854, - 15.0681, - 15.1508, - 14.3391, - 14.4164, - 14.4937, - 14.5710, - 14.6483, - 14.7256, - 
14.8029, - 14.8802, - 14.9575, - 15.0348, - 15.1121, + 16.0, + 16.089788, + 16.17958, + 16.269371, + 16.359161, + 16.448954, + 16.538742, + 16.628534, + 16.718325, + 16.808115, + 16.897905, + 16.987696, + 15.812232, + 15.894914, + 15.977593, + 16.060274, ] ] ], From 18feba325e5e9e094aae7eeb1413beaf730bea38 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 22 May 2026 12:52:28 +0200 Subject: [PATCH 09/11] Updated reference values for test_scale_estimation ONNX backend. --- .../quantization/test_weights_compression.py | 176 ++++++++-------- .../quantization/test_weights_compression.py | 30 +-- .../quantization/test_weights_compression.py | 30 +-- tests/torch/fx/test_weights_compression.py | 188 +++++++++--------- 4 files changed, 212 insertions(+), 212 deletions(-) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index 7c52a61764e..2501c360c0a 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -553,40 +553,40 @@ def get_scale_estimation_ref(check_sampling_activation_stats_flow): return ( np.array( [ - [[0.473328]], - [[0.929023]], - [[1.446527]], - [[1.920595]], - [[2.517054]], - [[3.030102]], - [[3.584279]], - [[4.043509]], - [[4.620008]], - [[5.165322]], - [[5.710637]], - [[6.122581]], - [[6.655914]], - [[7.237174]], - [[7.722580]], + [[0.47332805]], + [[1.0]], + [[1.4732642]], + [[2.0380495]], + [[2.6054149]], + [[3.0301015]], + [[3.679056]], + [[4.175322]], + [[4.700384]], + [[5.2552223]], + [[5.8100615]], + [[6.3083715]], + [[6.858295]], + [[7.4082184]], + [[7.722581]], [[8.255914]], ] ).T, np.array( [ [[0.47344488]], - [[0.9287766]], - [[1.4463282]], - [[1.920052]], - [[2.5167778]], + [[1.0]], + [[1.5450557]], + [[2.0380037]], + [[2.6055446]], [[3.02987]], - [[3.5842714]], - [[4.0429296]], - [[4.619769]], - [[5.165224]], - [[5.7106786]], - [[6.121212]], - [[6.654546]], - [[7.2366524]], + [[3.679132]], + 
[[4.1754694]], + [[4.7001443]], + [[5.2551227]], + [[5.810101]], + [[6.308658]], + [[6.8587303]], + [[7.4]], [[7.7212124]], [[8.254545]], ] @@ -601,44 +601,44 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ [ - 7.5732, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.2602, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.3083, - 7.8467, - 7.2233, - 7.2715, - 7.4205, - 7.4667, + 7.573249, + 7.58195, + 7.6, + 7.6666665, + 7.1209445, + 7.260152, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.528544, + 8.659291, + 8.879055, + 8.469787, + 8.4, + 8.364824, ] ] ], [ [ [ - 14.8205, - 14.9032, - 14.9858, - 15.0685, - 15.1512, - 14.3400, - 14.4173, - 14.4945, - 14.5718, - 14.6491, - 14.7264, - 14.8037, - 14.8810, - 14.9583, - 15.0355, - 15.1128, + 16.0, + 16.089771, + 16.179543, + 16.269318, + 16.359089, + 16.44886, + 16.538631, + 16.628407, + 16.718176, + 16.80795, + 16.89772, + 16.987492, + 15.812495, + 15.89516, + 15.977826, + 16.060493, ] ] ], @@ -650,43 +650,43 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ 7.575118, - 7.4666667, - 7.4666667, - 7.4666667, - 7.4666667, + 7.5841107, + 7.6, + 7.6666665, + 7.112954, 7.254837, - 7.4666667, - 7.4666667, - 7.4666667, - 7.4666667, - 7.495066, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.531546, 7.850108, - 7.219489, - 7.2685375, - 7.418597, - 7.4666667, + 8.887045, + 8.468656, + 8.4, + 8.361673, ] ] ], [ [ [ - 14.820066, - 14.902746, - 14.985427, - 15.068108, - 15.150787, - 14.3391285, - 14.416424, - 14.493721, - 14.571016, - 14.648311, - 14.725608, - 14.802904, - 14.8801985, - 14.957496, - 15.034791, - 15.112087, + 16.0, + 16.089788, + 16.17958, + 16.269371, + 16.359161, + 16.448954, + 16.538742, + 16.628534, + 16.718325, + 16.808115, + 16.897905, + 16.987696, + 15.812232, + 15.894914, + 15.977593, + 16.060274, ] ] ], diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 
aa9ea746ea3..a5df7c0c735 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -2492,21 +2492,21 @@ def get_scale_estimation_ref(check_sampling_activation_stats_flow): np.array( [ [[0.47344488]], - [[1. ]], - [[1.5450557 ]], - [[2.0380037 ]], - [[2.6055446 ]], - [[3.02987 ]], - [[3.679132 ]], - [[4.1754694 ]], - [[4.7001443 ]], - [[5.2551227 ]], - [[5.810101 ]], - [[6.308658 ]], - [[6.8587303 ]], - [[7.4 ]], - [[7.7212124 ]], - [[8.254545 ]], + [[1.0]], + [[1.5450557]], + [[2.0380037]], + [[2.6055446]], + [[3.02987]], + [[3.679132]], + [[4.1754694]], + [[4.7001443]], + [[5.2551227]], + [[5.810101]], + [[6.308658]], + [[6.8587303]], + [[7.4]], + [[7.7212124]], + [[8.254545]], ] ), )[check_sampling_activation_stats_flow] diff --git a/tests/torch/function_hook/quantization/test_weights_compression.py b/tests/torch/function_hook/quantization/test_weights_compression.py index c7446a3a27d..9276b8c9cc6 100644 --- a/tests/torch/function_hook/quantization/test_weights_compression.py +++ b/tests/torch/function_hook/quantization/test_weights_compression.py @@ -680,21 +680,21 @@ def get_scale_estimation_ref(check_sampling_activation_stats_flow): torch.tensor( [ [[0.47344488]], - [[1. 
]], - [[1.5450557 ]], - [[2.0380037 ]], - [[2.6055446 ]], - [[3.02987 ]], - [[3.679132 ]], - [[4.1754694 ]], - [[4.7001443 ]], - [[5.2551227 ]], - [[5.810101 ]], - [[6.308658 ]], - [[6.8587303 ]], - [[7.4 ]], - [[7.7212124 ]], - [[8.254545 ]], + [[1.0]], + [[1.5450557]], + [[2.0380037]], + [[2.6055446]], + [[3.02987]], + [[3.679132]], + [[4.1754694]], + [[4.7001443]], + [[5.2551227]], + [[5.810101]], + [[6.308658]], + [[6.8587303]], + [[7.4]], + [[7.7212124]], + [[8.254545]], ] ), )[check_sampling_activation_stats_flow] diff --git a/tests/torch/fx/test_weights_compression.py b/tests/torch/fx/test_weights_compression.py index d05e002cd7d..004f0e018d0 100644 --- a/tests/torch/fx/test_weights_compression.py +++ b/tests/torch/fx/test_weights_compression.py @@ -437,41 +437,41 @@ def get_scale_estimation_ref(check_sampling_activation_stats_flow): return ( torch.tensor( [ - [[0.473328]], - [[0.929023]], - [[1.446527]], - [[1.920595]], - [[2.517054]], - [[3.030102]], - [[3.584279]], - [[4.043509]], - [[4.620008]], - [[5.165322]], - [[5.710637]], - [[6.122581]], - [[6.655914]], - [[7.237174]], - [[7.722580]], + [[0.47332805]], + [[1.0]], + [[1.4732642]], + [[2.0380495]], + [[2.6054149]], + [[3.0301015]], + [[3.679056]], + [[4.175322]], + [[4.700384]], + [[5.2552223]], + [[5.8100615]], + [[6.3083715]], + [[6.858295]], + [[7.4082184]], + [[7.722581]], [[8.255914]], ] ), torch.tensor( [ - [[0.473445]], - [[0.928777]], - [[1.446328]], - [[1.920052]], - [[2.516778]], - [[3.029870]], - [[3.584271]], - [[4.042929]], - [[4.619769]], - [[5.165224]], - [[5.710679]], - [[6.121212]], - [[6.654546]], - [[7.236652]], - [[7.721212]], + [[0.47344488]], + [[1.0]], + [[1.5450557]], + [[2.0380037]], + [[2.6055446]], + [[3.02987]], + [[3.679132]], + [[4.1754694]], + [[4.7001443]], + [[5.2551227]], + [[5.810101]], + [[6.308658]], + [[6.8587303]], + [[7.4]], + [[7.7212124]], [[8.254545]], ] ), @@ -485,44 +485,44 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ [ - 
7.5732, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.2602, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.3083, - 7.8467, - 7.2233, - 7.2715, - 7.4205, - 7.4667, + 7.573249, + 7.58195, + 7.6, + 7.6666665, + 7.1209445, + 7.260152, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.528544, + 8.659291, + 8.879055, + 8.469787, + 8.4, + 8.364824, ] ] ], [ [ [ - 14.8205, - 14.9032, - 14.9858, - 15.0685, - 15.1512, - 14.3400, - 14.4173, - 14.4945, - 14.5718, - 14.6491, - 14.7264, - 14.8037, - 14.8810, - 14.9583, - 15.0355, - 15.1128, + 16.0, + 16.089771, + 16.179543, + 16.269318, + 16.359089, + 16.44886, + 16.538631, + 16.628407, + 16.718176, + 16.80795, + 16.89772, + 16.987492, + 15.812495, + 15.89516, + 15.977826, + 16.060493, ] ] ], @@ -533,44 +533,44 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): [ [ [ - 7.5751, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.2548, - 7.4667, - 7.4667, - 7.4667, - 7.4667, - 7.4951, - 7.8501, - 7.2195, - 7.2685, - 7.4186, - 7.4667, + 7.575118, + 7.5841107, + 7.6, + 7.6666665, + 7.112954, + 7.254837, + 7.866667, + 7.9333334, + 8.0, + 8.066667, + 8.531546, + 7.850108, + 8.887045, + 8.468656, + 8.4, + 8.361673, ] ] ], [ [ [ - 14.8201, - 14.9027, - 14.9854, - 15.0681, - 15.1508, - 14.3391, - 14.4164, - 14.4937, - 14.5710, - 14.6483, - 14.7256, - 14.8029, - 14.8802, - 14.9575, - 15.0348, - 15.1121, + 16.0, + 16.089788, + 16.17958, + 16.269371, + 16.359161, + 16.448954, + 16.538742, + 16.628534, + 16.718325, + 16.808115, + 16.897905, + 16.987696, + 15.812232, + 15.894914, + 15.977593, + 16.060274, ] ] ], From 02239dacce4596526af2c5dbff56e7c66fc6008d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 22 May 2026 13:56:15 +0200 Subject: [PATCH 10/11] Fixed test_fq_lora_export. 
--- tests/torch/function_hook/quantization/test_fq_lora.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/torch/function_hook/quantization/test_fq_lora.py b/tests/torch/function_hook/quantization/test_fq_lora.py index 9b4ab58912b..81073f47c8a 100644 --- a/tests/torch/function_hook/quantization/test_fq_lora.py +++ b/tests/torch/function_hook/quantization/test_fq_lora.py @@ -118,7 +118,8 @@ def test_fq_lora_export(compression_kwargs, _seed): Tests FQ-LoRA (Fake-Quantize with Low-Rank Adaptation) can be stripped and exported to OpenVINO. """ device = "cuda" - example_input = 0.01 * torch.arange(0, 4 * 8, device=device).reshape(1, 4, 8) + 0.02 + example_input = 0.01 * torch.arange(0, 4 * 8, device=device).reshape(1, 4, 8) + example_input = example_input - example_input.mean(dim=-1, keepdim=True) model = AWQLinearModel().to(device) model = nncf.compress_weights( @@ -140,6 +141,7 @@ def test_fq_lora_export(compression_kwargs, _seed): example_inputs_numpy = example_input.detach().cpu().numpy() stripped_ov_output = torch.tensor(model(example_inputs_numpy)[0], device=example_input.device) + # TODO(aanuf): fix input_low, input_range computation for AsymmetricQuantizer assert torch.allclose(tuned_output, stripped_output, atol=1e-1) assert torch.allclose(tuned_output, stripped_ov_output, atol=1e-1) From daace3bd701544c25619876e4f239c6091b987a5 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 22 May 2026 15:44:59 +0200 Subject: [PATCH 11/11] Aligned weight values between OV and Torch. 
--- .../algorithms/weight_compression/weight_lowering.py | 1 + .../test_templates/template_test_weights_compression.py | 1 + .../function_hook/quantization/test_weights_compression.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 86ba2153ee8..548c5252fd3 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -370,6 +370,7 @@ def get_integer_quantization_error( decompressed_weight = integer_quantize_dequantize_weight(weight, config, reduction_axes) decompressed_weight = decompressed_weight.reshape(weight.shape) + if reduction == "max_mean": diff = (decompressed_weight - weight) ** 2 layer_err = fns.mean(diff, axis=reduction_axes) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index b99ef81d13b..c72c1964f2b 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -230,6 +230,7 @@ def wrap_model(model, data) -> CompressionParams: def test_mixed_precision(self, mode, all_layers, ratio, ref_ids, transpose_a, mocker): model = self.get_sequential_matmul_model(transpose_a=transpose_a) input_shape = (4, 4) if transpose_a else (1, 4, 4) + first = self.to_tensor(np.ones(input_shape, dtype=np.float32)) second = self.to_tensor(np.arange(16, dtype=np.float32)).reshape(input_shape) dataset = Dataset([first, second], self.get_transform_func()) diff --git a/tests/torch/function_hook/quantization/test_weights_compression.py b/tests/torch/function_hook/quantization/test_weights_compression.py index 9276b8c9cc6..a970b808557 100644 --- a/tests/torch/function_hook/quantization/test_weights_compression.py +++ 
b/tests/torch/function_hook/quantization/test_weights_compression.py @@ -75,7 +75,7 @@ def __init__(self): weights_data[-1, -1] = main_value weight_tensor = weights_data.detach().clone() layer = nn.Linear(4, 4, bias=False) - layer.weight = nn.Parameter(weight_tensor.t()) + layer.weight = nn.Parameter(weight_tensor) self.layers.append(layer) def forward(self, x):