From de9e8070054268e30c7f722a1ecb293ae6d6a3aa Mon Sep 17 00:00:00 2001 From: abhayuvi Date: Fri, 13 Mar 2026 12:48:58 +0530 Subject: [PATCH 1/2] feat: add HQQ data-free weight compression algorithm --- .ci/cspell_dict.txt | 1 + src/nncf/__init__.py | 1 + .../torch/fx/quantization/quantize_model.py | 10 +- src/nncf/onnx/quantization/quantize_model.py | 10 +- .../openvino/quantization/quantize_model.py | 10 +- src/nncf/quantization/advanced_parameters.py | 18 ++ .../weight_compression/algorithm.py | 27 ++ .../algorithms/weight_compression/hqq.py | 240 ++++++++++++++++++ src/nncf/quantization/quantize_model.py | 14 +- src/nncf/torch/quantization/quantize_model.py | 10 +- .../openvino/native/quantization/test_hqq.py | 189 ++++++++++++++ 11 files changed, 511 insertions(+), 19 deletions(-) create mode 100644 src/nncf/quantization/algorithms/weight_compression/hqq.py create mode 100644 tests/openvino/native/quantization/test_hqq.py diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt index 1f5f531275e..ecd435d4dc0 100644 --- a/.ci/cspell_dict.txt +++ b/.ci/cspell_dict.txt @@ -182,6 +182,7 @@ hellaswag hiddens hparam hparams +hqq hswish huggingface hutter diff --git a/src/nncf/__init__.py b/src/nncf/__init__.py index 475d7e836fc..4077bb19ed1 100644 --- a/src/nncf/__init__.py +++ b/src/nncf/__init__.py @@ -61,6 +61,7 @@ from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters as AdvancedBiasCorrectionParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as AdvancedGPTQParameters +from nncf.quantization.advanced_parameters import AdvancedHQQParameters as AdvancedHQQParameters from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as AdvancedLoraCorrectionParameters from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters as AdvancedQuantizationParameters from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters as AdvancedScaleEstimationParameters diff --git a/src/nncf/experimental/torch/fx/quantization/quantize_model.py b/src/nncf/experimental/torch/fx/quantization/quantize_model.py index 8eff430895b..c5fa6c0b291 100644 --- a/src/nncf/experimental/torch/fx/quantization/quantize_model.py +++ b/src/nncf/experimental/torch/fx/quantization/quantize_model.py @@ -129,8 +129,9 @@ def compress_weights_impl( scale_estimation: bool, gptq: bool, lora_correction: bool, - backup_mode: BackupMode, - compression_format: CompressionFormat, + hqq: bool = False, + backup_mode: BackupMode = BackupMode.INT8_ASYM, + compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters | None = None, ) -> torch.fx.GraphModule: """ @@ -149,8 +150,9 @@ def compress_weights_impl( gptq, lora_correction, backup_mode, - compression_format, - advanced_parameters, + hqq=hqq, + compression_format=compression_format, + advanced_parameters=advanced_parameters, ) graph = build_graph(model) compressed_model = compression_algorithm.apply(model, graph, dataset=dataset) diff --git a/src/nncf/onnx/quantization/quantize_model.py b/src/nncf/onnx/quantization/quantize_model.py index 4ec6b8c6111..fddaee53aa1 100644 --- a/src/nncf/onnx/quantization/quantize_model.py +++ b/src/nncf/onnx/quantization/quantize_model.py @@ -324,8 +324,9 @@ def compress_weights_impl( scale_estimation: bool, gptq: bool, lora_correction: bool, - backup_mode: BackupMode, - compression_format: CompressionFormat, + hqq: bool = False, + backup_mode: BackupMode = BackupMode.INT8_ASYM, + compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters | None = None, ) -> onnx.ModelProto: if model.opset_import[0].version < 13: @@ -357,8 +358,9 @@ def compress_weights_impl( gptq, lora_correction, backup_mode, - compression_format, - advanced_parameters, + hqq=hqq, + compression_format=compression_format, + advanced_parameters=advanced_parameters, ) graph = build_graph(model) diff --git a/src/nncf/openvino/quantization/quantize_model.py b/src/nncf/openvino/quantization/quantize_model.py index 19031fb1674..37f9d77601f 100644 --- a/src/nncf/openvino/quantization/quantize_model.py +++ b/src/nncf/openvino/quantization/quantize_model.py @@ -376,8 +376,9 @@ def compress_weights_impl( scale_estimation: bool, gptq: bool, lora_correction: bool, - backup_mode: BackupMode, - compression_format: CompressionFormat, + hqq: bool = False, + backup_mode: BackupMode = BackupMode.INT8_ASYM, + compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters | None = None, ) -> ov.Model: """ @@ -398,8 +399,9 @@ def compress_weights_impl( gptq, lora_correction, backup_mode, - compression_format, - advanced_parameters, + hqq=hqq, + compression_format=compression_format, + advanced_parameters=advanced_parameters, ) statistics_points = None diff --git a/src/nncf/quantization/advanced_parameters.py b/src/nncf/quantization/advanced_parameters.py index df39661f164..71a4b925742 100644 --- a/src/nncf/quantization/advanced_parameters.py +++ b/src/nncf/quantization/advanced_parameters.py @@ -382,6 +382,21 @@ class AdvancedLoraCorrectionParameters: use_int8_adapters: bool = True +@api() +@dataclass +class AdvancedHQQParameters: + """ + Contains advanced parameters for the HQQ (Half-Quadratic Quantization) algorithm. + + :param num_iterations: Number of alternating optimization iterations used to jointly + refine scale and zero point. More iterations improve quality at the cost of runtime. + Defaults to 20. + :type num_iterations: int + """ + + num_iterations: int = 20 + + @api() @dataclass class AdvancedAdaptiveCodebookParameters: @@ -423,6 +438,8 @@ class AdvancedCompressionParameters: :type scale_estimation_params: AdvancedScaleEstimationParameters :param gptq_params: Advanced parameters for GPTQ algorithm. :type gptq_params: AdvancedGPTQParameters + :param hqq_params: Advanced parameters for HQQ algorithm. + :type hqq_params: AdvancedHQQParameters :param lora_correction_params: Advanced parameters for Lora Correction algorithm. :type lora_correction_params: AdvancedLoraCorrectionParameters :param backend_params: Backend-specific parameters. @@ -443,6 +460,7 @@ class AdvancedCompressionParameters: default_factory=AdvancedScaleEstimationParameters ) gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters) + hqq_params: AdvancedHQQParameters = field(default_factory=AdvancedHQQParameters) lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters) backend_params: dict[str, Any] = field(default_factory=dict) codebook: TTensor | None = None diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index af9c4c2661a..4166f1e16aa 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -45,6 +45,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.gptq import GPTQ +from nncf.quantization.algorithms.weight_compression.hqq import HQQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation @@ -81,6 +82,7 @@ def get_weight_compression_configuration( scale_estimation: bool | None = None, gptq: bool | None = None, lora_correction: bool | None = None, + hqq: bool | None = None, ignored_scope: IgnoredScope | None = None, sensitivity_metric: SensitivityMetric | None = None, backup_mode: BackupMode | None = None, @@ -120,6 +122,7 @@ def get_weight_compression_configuration( "scale_estimation": scale_estimation or False, "gptq": gptq or False, "lora_correction": lora_correction or False, + "hqq": hqq or False, "ignored_scope": ignored_scope or IgnoredScope(), "sensitivity_metric": ( ( @@ -146,6 +149,7 @@ def check_user_compression_configuration( scale_estimation: bool | None, gptq: bool | None, lora_correction: bool | None, + hqq: bool | None, ignored_scope: IgnoredScope | None, sensitivity_metric: SensitivityMetric | None, backup_mode: BackupMode | None, @@ -175,6 +179,7 @@ def check_user_compression_configuration( "scale_estimation": scale_estimation, "gptq": gptq, "lora_correction": lora_correction, + "hqq": hqq, "backup_mode": backup_mode, } unsupported_for_int8 = [name for name, value in unsupported_options.items() if value is not None] @@ -257,6 +262,10 @@ def check_user_compression_configuration( requires a dataset, but it's not provided." raise nncf.ValidationError(msg) + if hqq and gptq: + msg = "Simultaneous use of HQQ and GPTQ algorithms is not supported. Select one of them." + raise nncf.ParameterNotSupportedError(msg) + if lora_correction and compression_format in [ CompressionFormat.FQ, CompressionFormat.FQ_LORA, @@ -311,6 +320,7 @@ def __init__( gptq: bool, lora_correction: bool, backup_mode: BackupMode, + hqq: bool = False, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters | None = None, ): @@ -355,6 +365,10 @@ def __init__( INT8_ASYM stands for 8-bit integer asymmetric quantization with a typical non-fixed zero point. MXFP8_E4M3 stands for MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. FP8_E4M3 stands for FP8 format with E4M3 values sharing group-level fp16 scale. + :param hqq: determines whether to use the HQQ (Half-Quadratic Quantization) algorithm. + HQQ is a data-free method that optimizes scale and zero-point jointly via alternating + least-squares, typically producing lower quantization error than standard min-max + initialization, especially for 4-bit group-wise compression. :param compression_format: Describes the format in which the model is saved after weight compression. :param advanced_parameters: advanced parameters for algorithms in compression pipeline. """ @@ -376,6 +390,7 @@ def __init__( self._codebook_estimation = mode == CompressWeightsMode.ADAPTIVE_CODEBOOK self._backup_mode = backup_mode self._compression_format = compression_format + self._hqq = hqq self._advanced_parameters = ( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) @@ -405,6 +420,9 @@ def __init__( subset_size=gptq_params.subset_size, scale_estimation=self._scale_estimation, ) + if self._hqq: + hqq_params = self._advanced_parameters.hqq_params + self._hqq_algo = HQQ(num_iterations=hqq_params.num_iterations) if self._scale_estimation: scale_estimation_params = self._advanced_parameters.scale_estimation_params self._scale_estimation_algo = ScaleEstimation( @@ -1163,6 +1181,14 @@ def apply_with_parameters( backend_entity=self._backend_entity, ) else: + if self._hqq: + precomputed_compressed_weights = self._hqq_algo.apply( + model=model, + graph=graph, + all_weight_params=all_weight_params, + backend_entity=self._backend_entity, + ) + if self._scale_estimation: precomputed_compressed_weights = self._scale_estimation_algo.apply( model=model, @@ -1211,6 +1237,7 @@ def apply_with_parameters( "scale_estimation": self._scale_estimation, "gptq": self._gptq, "lora_correction": self._lora_correction, + "hqq": self._hqq, "backup_mode": self._backup_mode.value, "compression_format": self._compression_format.value, "advanced_parameters": convert_to_dict_recursively(self._advanced_parameters), diff --git a/src/nncf/quantization/algorithms/weight_compression/hqq.py b/src/nncf/quantization/algorithms/weight_compression/hqq.py new file mode 100644 index 00000000000..fe47cfda306 --- /dev/null +++ b/src/nncf/quantization/algorithms/weight_compression/hqq.py @@ -0,0 +1,240 @@ +# Copyright (c) 2026 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TypeVar + +import nncf +from nncf.common.graph.graph import NNCFGraph +from nncf.common.logging.track_progress import track +from nncf.common.utils.backend import BackendType +from nncf.common.utils.backend import get_backend +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight +from nncf.quantization.algorithms.weight_compression.weight_lowering import ReductionAxes +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.tensor import Tensor +from nncf.tensor import functions as fns +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.definitions import TensorDataType + +TModel = TypeVar("TModel") + + +class HQQ: + """ + Half-Quadratic Quantization (HQQ) algorithm implementation. + + HQQ is a data-free weight quantization algorithm that minimizes quantization error + without requiring calibration data. It uses alternating least-squares optimization + to jointly find optimal scale and zero-point parameters, producing floating-point + zero points for asymmetric quantization. + + Reference: "Half-Quadratic Quantization of Large Machine Learning Models" + (https://mobiusml.github.io/hqq_blog/) + """ + + def __init__(self, num_iterations: int = 20): + """ + :param num_iterations: Number of alternating optimization iterations. + More iterations improve quantization quality at the cost of compute time. + Defaults to 20. + """ + self._num_iterations = num_iterations + self._backend_entity = None + + @property + def available_backends(self) -> list[BackendType]: + return [BackendType.OPENVINO, BackendType.TORCH, BackendType.ONNX] + + def _set_backend_entity(self, model: TModel) -> None: + """ + Creates a helper class with a backend-specific logic of the algorithm. + + :param model: Backend-specific input model. + """ + model_backend = get_backend(model) + if model_backend == BackendType.OPENVINO: + from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend + + self._backend_entity = OVWeightCompressionAlgoBackend(model) + elif model_backend == BackendType.TORCH: + from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend + + self._backend_entity = PTWeightCompressionAlgoBackend() + elif model_backend == BackendType.TORCH_FX: + from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend + + self._backend_entity = FXWeightCompressionAlgoBackend() + elif model_backend == BackendType.ONNX: + from nncf.quantization.algorithms.weight_compression.onnx_backend import ONNXWeightCompressionAlgoBackend + + self._backend_entity = ONNXWeightCompressionAlgoBackend(model) + else: + msg = ( + "Cannot return backend-specific HQQ entity because" + f" {model_backend.value} is not supported!" + ) + raise nncf.UnsupportedBackendError(msg) + + def apply( + self, + model: TModel, + graph: NNCFGraph, + all_weight_params: list[WeightCompressionParameters], + backend_entity: WeightCompressionAlgoBackend | None = None, + ) -> dict[str, CompressedWeight]: + """ + Applies the HQQ algorithm to compute optimized scale and zero-point parameters. + + For each eligible weight, HQQ alternately: + 1. Quantizes the weight with the current (scale, zero_point), and + 2. Updates (scale, zero_point) via a closed-form least-squares step. + + The resulting CompressedWeight objects contain None for the compressed tensor + (quantization is deferred) but carry the HQQ-optimized float scale and, for + asymmetric modes, a float-valued zero point. + + :param model: Model for applying algorithm. + :param graph: Model graph. + :param all_weight_params: List of all weight parameters. + :param backend_entity: Weight compression algorithm backend. + :return: A dictionary mapping weight names to CompressedWeight instances with + HQQ-optimized scale and zero point. + """ + self._backend_entity = backend_entity + if self._backend_entity is None: + self._set_backend_entity(model) + + res = {} + + for wp in track(all_weight_params, description="Applying HQQ"): + weight_name = wp.weight_name + config = wp.compression_config + + if not config.is_integer: + res[weight_name] = CompressedWeight() + continue + + weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) + if len(weight_data) != 1: # not supported by the algorithm + continue + _, weight_port_id = weight_data[0] + + weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) + + # Convert to numpy for stable in-loop arithmetic, avoiding the OV-optimized + # quantization path, which may not handle float zero points. + if weight.backend == TensorBackend.ov: + weight = weight.as_numpy_tensor() + weight = fns.astype(weight, TensorDataType.float32) + + scale, zero_point = self._calculate_hqq_params(weight, config, wp.reduction_axes) + res[weight_name] = CompressedWeight(None, scale, zero_point, None) + + return res + + def _calculate_hqq_params( + self, + weight: Tensor, + config: WeightCompressionConfig, + reduction_axes: ReductionAxes, + ) -> tuple[Tensor, Tensor | None]: + """ + Computes HQQ-optimized scale and zero point for integer quantization. + + The algorithm alternates between two steps until convergence: + - Quantization step: Q = clamp(round(W / s + z), q_min, q_max) + - Parameter update: + * Asymmetric (W ≈ s * (Q - z)): joint closed-form least-squares for s and z. + * Symmetric (W ≈ s * Q): closed-form update for s alone. + + For the asymmetric case the zero point z is float-valued (not rounded to an integer), + which gives HQQ better reconstruction quality than standard min-max initialization. + + :param weight: Weight tensor in float32. + :param config: Weight compression configuration. + :param reduction_axes: Reduction axes for the weight tensor. + :return: Tuple of (scale, zero_point). zero_point is float for asymmetric mode, + None for symmetric mode. + """ + group_size = config.group_size + group_reduction_axes = reduction_axes + + # Reshape weights for grouped quantization when a group size is specified. + if group_size != -1: + weight, group_reduction_axes = reshape_weight_for_grouped_quantization( + weight, reduction_axes, group_size + ) + + # Number of elements along the reduction axis (i.e. per group). + if isinstance(group_reduction_axes, int): + n = weight.shape[group_reduction_axes] + else: + n = 1 + for ax in group_reduction_axes: + n *= weight.shape[ax] + + num_bits = config.num_bits + is_asym = config.is_asym_mode + level_low = 0 if is_asym else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if is_asym else 2 ** (num_bits - 1) - 1 + + eps = fns.finfo(weight).eps + + # Initialize with standard min-max quantization parameters. + scale, zero_point = calculate_integer_quantization_params(weight, group_reduction_axes, config) + + # Cast integer zero point to float32 so arithmetic below is uniform. + if zero_point is not None: + zero_point = fns.astype(zero_point, TensorDataType.float32) + + for _ in range(self._num_iterations): + # Quantization step: Q = clamp(round(W / s + z), q_min, q_max) + q_float = weight / scale + if zero_point is not None: + q_float = q_float + zero_point + q_float = fns.round(q_float) + q_float = fns.clip(q_float, level_low, level_high) + + if is_asym: + # Asymmetric least-squares update for (s, z): minimize ||W - s*(Q - z)||^2. + # Letting b = s*z, normal equations give: + # det = n * sum_QQ - sum_Q^2 + # s = (n * sum_QW - sum_Q * sum_W) / det + # z = (sum_Q * sum_QW - sum_QQ * sum_W) / (det * s) + sum_q = fns.sum(q_float, axis=group_reduction_axes, keepdims=True) + sum_w = fns.sum(weight, axis=group_reduction_axes, keepdims=True) + sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True) + sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True) + + det = n * sum_qq - sum_q * sum_q + safe_det = fns.where(fns.abs(det) < eps, eps, det) + + new_scale = (n * sum_qw - sum_q * sum_w) / safe_det + new_scale = fns.where(fns.abs(new_scale) < eps, eps, new_scale) + new_zero_point = (sum_q * sum_qw - sum_qq * sum_w) / (safe_det * new_scale) + + scale = new_scale + zero_point = new_zero_point + + else: + # Symmetric least-squares update for s: minimize ||W - s*Q||^2. + # s = sum(W*Q) / sum(Q^2) + sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True) + sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True) + denom = fns.where(fns.abs(sum_qq) < eps, eps, sum_qq) + scale = sum_qw / denom + scale = fns.where(fns.abs(scale) < eps, eps, scale) + + return scale, zero_point diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index ffd1dacc833..b030283643e 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -422,6 +422,7 @@ def compress_weights( scale_estimation: bool | None = None, gptq: bool | None = None, lora_correction: bool | None = None, + hqq: bool | None = None, backup_mode: BackupMode | None = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters | None = None, @@ -478,6 +479,11 @@ def compress_weights( :type gptq: bool :param lora_correction: Indicates whether to use Lora Correction algorithm. :type lora_correction: bool + :param hqq: Indicates whether to use the HQQ (Half-Quadratic Quantization) algorithm. + HQQ is a data-free method that optimizes scale and zero-point via alternating least-squares, + producing lower quantization error than standard min-max initialization. + Currently supported for OpenVINO backend only. + :type hqq: bool :param backup_mode: Defines a backup mode for mixed-precision weight compression. NONE stands for original floating-point precision of the model weights. In this mode, weights are retained in their original precision without any quantization. @@ -522,7 +528,7 @@ def compress_weights( ) raise nncf.ParameterNotSupportedError(msg) - options = {"gptq": gptq, "lora_correction": lora_correction} + options = {"gptq": gptq, "lora_correction": lora_correction, "hqq": hqq} unsupported_options = [name for name, value in options.items() if value is not None] if unsupported_options: msg = f"Torch backend does not support {', '.join(unsupported_options)} option(s). Set them to None." @@ -571,7 +577,7 @@ def compress_weights( ) raise nncf.ParameterNotSupportedError(msg) - options = {"gptq": gptq, "lora_correction": lora_correction} + options = {"gptq": gptq, "lora_correction": lora_correction, "hqq": hqq} unsupported_options = [name for name, value in options.items() if value is not None] if unsupported_options: msg = f"TorchFX backend does not support {', '.join(unsupported_options)} option(s). Set them to None." @@ -644,7 +650,7 @@ def compress_weights( ) raise nncf.ParameterNotSupportedError(msg) - options = {"gptq": gptq, "lora_correction": lora_correction} + options = {"gptq": gptq, "lora_correction": lora_correction, "hqq": hqq} unsupported_options = [name for name, value in options.items() if value is not None] if unsupported_options: msg = f"ONNX backend does not support {', '.join(unsupported_options)} option(s). Set them to None." @@ -669,6 +675,7 @@ def compress_weights( scale_estimation, gptq, lora_correction, + hqq, ignored_scope, sensitivity_metric, backup_mode, @@ -685,6 +692,7 @@ def compress_weights( scale_estimation, gptq, lora_correction, + hqq, ignored_scope, sensitivity_metric, backup_mode, diff --git a/src/nncf/torch/quantization/quantize_model.py b/src/nncf/torch/quantization/quantize_model.py index 306286bdd18..e9eccfbbed7 100644 --- a/src/nncf/torch/quantization/quantize_model.py +++ b/src/nncf/torch/quantization/quantize_model.py @@ -99,8 +99,9 @@ def compress_weights_impl( scale_estimation: bool, gptq: bool, lora_correction: bool, - backup_mode: BackupMode, - compression_format: CompressionFormat, + hqq: bool = False, + backup_mode: BackupMode = BackupMode.INT8_ASYM, + compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters | None = None, ) -> torch.nn.Module: """ @@ -119,8 +120,9 @@ def compress_weights_impl( gptq, lora_correction, backup_mode, - compression_format, - advanced_parameters, + hqq=hqq, + compression_format=compression_format, + advanced_parameters=advanced_parameters, ) graph = build_graph(model) diff --git a/tests/openvino/native/quantization/test_hqq.py b/tests/openvino/native/quantization/test_hqq.py new file mode 100644 index 00000000000..45c73889a79 --- /dev/null +++ b/tests/openvino/native/quantization/test_hqq.py @@ -0,0 +1,189 @@ +# Copyright (c) 2026 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +import nncf +from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.hqq import HQQ +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params +from nncf.quantization.algorithms.weight_compression.weight_lowering import ( + reshape_weight_for_grouped_quantization, +) +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor import functions as fns + + +def _make_weight(shape, seed=42, scale=10.0): + """Create a deterministic float32 weight tensor.""" + rng = np.random.default_rng(seed) + data = rng.standard_normal(shape).astype(np.float32) * scale + return Tensor(data) + + +def _quantization_error(weight: Tensor, scale: Tensor, zero_point: Tensor | None, config: WeightCompressionConfig, reduction_axes) -> float: + """Compute mean squared quantization error: E[|W - s*(Q - z)|^2].""" + group_size = config.group_size + w = weight + reduction = reduction_axes + + if group_size != -1: + w, reduction = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) + + q = w / scale + if zero_point is not None: + q = q + zero_point + q = fns.round(q) + + num_bits = config.num_bits + is_asym = config.is_asym_mode + level_low = 0 if is_asym else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if is_asym else 2 ** (num_bits - 1) - 1 + q = fns.clip(q, level_low, level_high) + + if zero_point is not None: + reconstructed = scale * (q - zero_point) + else: + reconstructed = scale * q + + diff = w - reconstructed + return float(fns.mean(diff * diff).data) + + +@pytest.mark.parametrize("mode,group_size,reduction_axes", [ + (CompressWeightsMode.INT4_ASYM, 16, 1), + (CompressWeightsMode.INT4_SYM, 16, 1), + (CompressWeightsMode.INT4_ASYM, -1, 1), +]) +def test_hqq_reduces_quantization_error(mode, group_size, reduction_axes): + """HQQ-optimized params should produce <= quantization error than min-max init.""" + weight = _make_weight((32, 64), seed=7) + config = WeightCompressionConfig(mode=mode, group_size=group_size) + + hqq = HQQ(num_iterations=20) + hqq_scale, hqq_zp = hqq._calculate_hqq_params(weight, config, reduction_axes) + + # Baseline: standard min-max initialization + w = weight + r = reduction_axes + if group_size != -1: + w, r = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) + baseline_scale, baseline_zp = calculate_integer_quantization_params(w, r, config) + + hqq_err = _quantization_error(weight, hqq_scale, hqq_zp, config, reduction_axes) + baseline_err = _quantization_error(weight, baseline_scale, baseline_zp, config, reduction_axes) + + assert hqq_err <= baseline_err + 1e-6, ( + f"HQQ error ({hqq_err:.6f}) should not exceed min-max error ({baseline_err:.6f}) " + f"for mode={mode.value}, group_size={group_size}" + ) + + +def test_hqq_asymmetric_float_zero_point(): + """For asymmetric modes HQQ should return a float-valued (non-integer) zero point.""" + weight = _make_weight((32, 64), seed=13, scale=5.0) + config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=16) + + hqq = HQQ(num_iterations=20) + _, zero_point = hqq._calculate_hqq_params(weight, config, reduction_axes=1) + + assert zero_point is not None, "Expected non-None zero_point for asymmetric mode" + + zp_np = zero_point.data + # The zero point should be float32 + assert zp_np.dtype == np.float32, f"Expected float32 zero point, got {zp_np.dtype}" + # At least some values should be non-integer (HQQ doesn't round to integers) + is_integer_valued = np.allclose(zp_np, np.round(zp_np), atol=1e-3) + assert not is_integer_valued, "HQQ zero point should be float-valued, not integer-valued" + + +def test_hqq_symmetric_no_zero_point(): + """For symmetric modes HQQ should return None as zero point.""" + weight = _make_weight((32, 64), seed=17) + config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_SYM, group_size=16) + + hqq = HQQ(num_iterations=20) + _, zero_point = hqq._calculate_hqq_params(weight, config, reduction_axes=1) + + assert zero_point is None, "Expected None zero_point for symmetric mode" + + +@pytest.mark.parametrize("num_iterations", [0, 1, 5, 20]) +def test_hqq_num_iterations_parameter(num_iterations): + """HQQ should be callable with various num_iterations values including zero.""" + weight = _make_weight((16, 32), seed=3) + config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=16) + + hqq = HQQ(num_iterations=num_iterations) + scale, zero_point = hqq._calculate_hqq_params(weight, config, reduction_axes=1) + + assert scale is not None + assert zero_point is not None + assert scale.shape == zero_point.shape + + +def test_hqq_advanced_parameters_exposed(): + """AdvancedHQQParameters must be importable from the nncf public namespace.""" + params = nncf.AdvancedHQQParameters(num_iterations=10) + assert params.num_iterations == 10 + + +def test_hqq_gptq_mutual_exclusion(): + """Specifying both hqq=True and gptq=True should raise ParameterNotSupportedError.""" + from nncf.quantization.algorithms.weight_compression.algorithm import check_user_compression_configuration + + with pytest.raises(nncf.ParameterNotSupportedError, match="HQQ and GPTQ"): + check_user_compression_configuration( + mode=CompressWeightsMode.INT4_ASYM, + subset_size=128, + dataset=None, + ratio=1.0, + group_size=128, + all_layers=None, + awq=None, + scale_estimation=None, + gptq=True, + lora_correction=None, + hqq=True, + ignored_scope=None, + sensitivity_metric=None, + backup_mode=None, + compression_format=None, + advanced_parameters=None, + ) + + +def test_hqq_int8_unsupported(): + """HQQ should not be accepted for INT8 modes.""" + from nncf.quantization.algorithms.weight_compression.algorithm import check_user_compression_configuration + + with pytest.raises(nncf.ParameterNotSupportedError, match="hqq"): + check_user_compression_configuration( + mode=CompressWeightsMode.INT8_ASYM, + subset_size=128, + dataset=None, + ratio=None, + group_size=None, + all_layers=None, + awq=None, + scale_estimation=None, + gptq=None, + lora_correction=None, + hqq=True, + ignored_scope=None, + sensitivity_metric=None, + backup_mode=None, + compression_format=None, + advanced_parameters=None, + ) From c40b11afee23f81826cfb0a2b0352fe0e10a35dc Mon Sep 17 00:00:00 2001 From: abhayuvi Date: Fri, 13 Mar 2026 16:13:09 +0530 Subject: [PATCH 2/2] fix: round HQQ zero_point to integer for consistent quant/dequant --- .../algorithms/weight_compression/hqq.py | 81 +++++++++++-------- .../openvino/native/quantization/test_hqq.py | 17 ++-- 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/hqq.py b/src/nncf/quantization/algorithms/weight_compression/hqq.py index fe47cfda306..d075d9ae713 100644 --- a/src/nncf/quantization/algorithms/weight_compression/hqq.py +++ b/src/nncf/quantization/algorithms/weight_compression/hqq.py @@ -37,8 +37,9 @@ class HQQ: HQQ is a data-free weight quantization algorithm that minimizes quantization error without requiring calibration data. It uses alternating least-squares optimization - to jointly find optimal scale and zero-point parameters, producing floating-point - zero points for asymmetric quantization. + to find optimal scale and zero-point parameters. For asymmetric quantization, HQQ + optimizes the zero-point as a continuous float during iterations, then rounds it to + the nearest integer before returning so that quantization and dequantization agree. Reference: "Half-Quadratic Quantization of Large Machine Learning Models" (https://mobiusml.github.io/hqq_blog/) @@ -154,19 +155,23 @@ def _calculate_hqq_params( Computes HQQ-optimized scale and zero point for integer quantization. The algorithm alternates between two steps until convergence: - - Quantization step: Q = clamp(round(W / s + z), q_min, q_max) + - Quantization step: Q = clamp(round(W * inv_s + z), q_min, q_max) - Parameter update: - * Asymmetric (W ≈ s * (Q - z)): joint closed-form least-squares for s and z. - * Symmetric (W ≈ s * Q): closed-form update for s alone. + * Asymmetric: scale is fixed (min-max init); only z is updated via + closed-form `z = mean(Q - W * inv_s)` (per the paper). + * Symmetric: z = None; scale is updated via `s = sum(W*Q) / sum(Q²)`. - For the asymmetric case the zero point z is float-valued (not rounded to an integer), - which gives HQQ better reconstruction quality than standard min-max initialization. + For the asymmetric case the zero point z is optimized as a continuous float during + iterations (giving better reconstruction than integer-only search), then rounded and + clipped to the valid integer range before being returned. This ensures consistency + between quantization (which uses the returned z) and dequantization (which loads z + as a stored integer, e.g. uint4). :param weight: Weight tensor in float32. :param config: Weight compression configuration. :param reduction_axes: Reduction axes for the weight tensor. - :return: Tuple of (scale, zero_point). zero_point is float for asymmetric mode, - None for symmetric mode. + :return: Tuple of (scale, zero_point). zero_point is an integer-valued float32 + tensor for asymmetric mode, None for symmetric mode. """ group_size = config.group_size group_reduction_axes = reduction_axes @@ -199,42 +204,52 @@ def _calculate_hqq_params( if zero_point is not None: zero_point = fns.astype(zero_point, TensorDataType.float32) + # Pre-compute inv_scale once; scale is fixed for asymmetric iterations. + inv_scale = 1.0 / fns.where(fns.abs(scale) < eps, eps, scale) + for _ in range(self._num_iterations): - # Quantization step: Q = clamp(round(W / s + z), q_min, q_max) - q_float = weight / scale + # Quantization step: Q = clamp(round(W * inv_s + z), q_min, q_max) + q_float = weight * inv_scale if zero_point is not None: q_float = q_float + zero_point q_float = fns.round(q_float) q_float = fns.clip(q_float, level_low, level_high) if is_asym: - # Asymmetric least-squares update for (s, z): minimize ||W - s*(Q - z)||^2. - # Letting b = s*z, normal equations give: - # det = n * sum_QQ - sum_Q^2 - # s = (n * sum_QW - sum_Q * sum_W) / det - # z = (sum_Q * sum_QW - sum_QQ * sum_W) / (det * s) - sum_q = fns.sum(q_float, axis=group_reduction_axes, keepdims=True) - sum_w = fns.sum(weight, axis=group_reduction_axes, keepdims=True) - sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True) - sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True) - - det = n * sum_qq - sum_q * sum_q - safe_det = fns.where(fns.abs(det) < eps, eps, det) - - new_scale = (n * sum_qw - sum_q * sum_w) / safe_det - new_scale = fns.where(fns.abs(new_scale) < eps, eps, new_scale) - new_zero_point = (sum_q * sum_qw - sum_qq * sum_w) / (safe_det * new_scale) - - scale = new_scale - zero_point = new_zero_point - + # Asymmetric: fix scale, update zero_point only (per the paper). + # Minimizing ||W - s*(Q - z)||² w.r.t. z gives: + # z = mean(Q - W/s) = sum(Q - W*inv_s) / n + zero_point = fns.sum(q_float - weight * inv_scale, axis=group_reduction_axes, keepdims=True) + zero_point = zero_point / n else: - # Symmetric least-squares update for s: minimize ||W - s*Q||^2. - # s = sum(W*Q) / sum(Q^2) + # Symmetric OLS update for scale: minimize ||W - s*Q||². + # s = sum(W*Q) / sum(Q²) sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True) sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True) denom = fns.where(fns.abs(sum_qq) < eps, eps, sum_qq) scale = sum_qw / denom scale = fns.where(fns.abs(scale) < eps, eps, scale) + inv_scale = 1.0 / scale + + # Round and clip zero_point to the valid integer range so that quantization + # and dequantization (which stores zp as uint4) use the exact same value. + if zero_point is not None: + zero_point = self._round_zero_point(zero_point, level_low, level_high) return scale, zero_point + + @staticmethod + def _round_zero_point(zero_point: Tensor, level_low: int, level_high: int) -> Tensor: + """ + Rounds the float zero_point to the nearest integer and clips it to the valid quantization range. + + HQQ optimizes z as a continuous value during iterations, but the OV backend stores + zero_point as integer (uint4 for INT4). To ensure that quantization and dequantization + use the same z, the final float z is rounded and clipped before returning. + + :param zero_point: Float zero_point tensor from HQQ iterations. + :param level_low: Minimum valid zero_point value. + :param level_high: Maximum valid zero_point value. + :return: Rounded and clipped zero_point. + """ + return fns.clip(fns.round(zero_point), level_low, level_high) diff --git a/tests/openvino/native/quantization/test_hqq.py b/tests/openvino/native/quantization/test_hqq.py index 45c73889a79..97688e29edf 100644 --- a/tests/openvino/native/quantization/test_hqq.py +++ b/tests/openvino/native/quantization/test_hqq.py @@ -90,8 +90,13 @@ def test_hqq_reduces_quantization_error(mode, group_size, reduction_axes): ) -def test_hqq_asymmetric_float_zero_point(): - """For asymmetric modes HQQ should return a float-valued (non-integer) zero point.""" +def test_hqq_asymmetric_zero_point_rounded(): + """HQQ should return an integer-valued zero point for use with uint4 storage. + + HQQ optimizes z as a continuous float during iterations, but the final value is + rounded and clipped so that quantization and dequantization use the same integer z. + The tensor dtype stays float32 (no cast), but all values should be integer-valued. + """ weight = _make_weight((32, 64), seed=13, scale=5.0) config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=16) @@ -101,11 +106,11 @@ def test_hqq_asymmetric_float_zero_point(): assert zero_point is not None, "Expected non-None zero_point for asymmetric mode" zp_np = zero_point.data - # The zero point should be float32 + # dtype remains float32 (no explicit cast); values are integer-valued after rounding. assert zp_np.dtype == np.float32, f"Expected float32 zero point, got {zp_np.dtype}" - # At least some values should be non-integer (HQQ doesn't round to integers) - is_integer_valued = np.allclose(zp_np, np.round(zp_np), atol=1e-3) - assert not is_integer_valued, "HQQ zero point should be float-valued, not integer-valued" + assert np.allclose(zp_np, np.round(zp_np), atol=1e-5), ( + "HQQ zero point should be integer-valued after rounding for consistent uint4 storage" + ) def test_hqq_symmetric_no_zero_point():