From de9e8070054268e30c7f722a1ecb293ae6d6a3aa Mon Sep 17 00:00:00 2001
From: abhayuvi <abhayuvi.raj@gmail.com>
Date: Fri, 13 Mar 2026 12:48:58 +0530
Subject: [PATCH 1/2] feat: add HQQ data-free weight compression algorithm

---
 .ci/cspell_dict.txt                           |   1 +
 src/nncf/__init__.py                          |   1 +
 .../torch/fx/quantization/quantize_model.py   |  10 +-
 src/nncf/onnx/quantization/quantize_model.py  |  10 +-
 .../openvino/quantization/quantize_model.py   |  10 +-
 src/nncf/quantization/advanced_parameters.py  |  18 ++
 .../weight_compression/algorithm.py           |  27 ++
 .../algorithms/weight_compression/hqq.py      | 240 ++++++++++++++++++
 src/nncf/quantization/quantize_model.py       |  14 +-
 src/nncf/torch/quantization/quantize_model.py |  10 +-
 .../openvino/native/quantization/test_hqq.py  | 189 ++++++++++++++
 11 files changed, 511 insertions(+), 19 deletions(-)
 create mode 100644 src/nncf/quantization/algorithms/weight_compression/hqq.py
 create mode 100644 tests/openvino/native/quantization/test_hqq.py

diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt
index 1f5f531275e..ecd435d4dc0 100644
--- a/.ci/cspell_dict.txt
+++ b/.ci/cspell_dict.txt
@@ -182,6 +182,7 @@ hellaswag
 hiddens
 hparam
 hparams
+hqq
 hswish
 huggingface
 hutter
diff --git a/src/nncf/__init__.py b/src/nncf/__init__.py
index 475d7e836fc..4077bb19ed1 100644
--- a/src/nncf/__init__.py
+++ b/src/nncf/__init__.py
@@ -61,6 +61,7 @@
 from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters as AdvancedBiasCorrectionParameters
 from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as AdvancedCompressionParameters
 from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as AdvancedGPTQParameters
+from nncf.quantization.advanced_parameters import AdvancedHQQParameters as AdvancedHQQParameters
 from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as AdvancedLoraCorrectionParameters
 from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters as AdvancedQuantizationParameters
 from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters as AdvancedScaleEstimationParameters
diff --git a/src/nncf/experimental/torch/fx/quantization/quantize_model.py b/src/nncf/experimental/torch/fx/quantization/quantize_model.py
index 8eff430895b..c5fa6c0b291 100644
--- a/src/nncf/experimental/torch/fx/quantization/quantize_model.py
+++ b/src/nncf/experimental/torch/fx/quantization/quantize_model.py
@@ -129,8 +129,9 @@ def compress_weights_impl(
     scale_estimation: bool,
     gptq: bool,
     lora_correction: bool,
-    backup_mode: BackupMode,
-    compression_format: CompressionFormat,
+    hqq: bool = False,
+    backup_mode: BackupMode = BackupMode.INT8_ASYM,
+    compression_format: CompressionFormat = CompressionFormat.DQ,
     advanced_parameters: AdvancedCompressionParameters | None = None,
 ) -> torch.fx.GraphModule:
     """
@@ -149,8 +150,9 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
-        compression_format,
-        advanced_parameters,
+        hqq=hqq,
+        compression_format=compression_format,
+        advanced_parameters=advanced_parameters,
     )
     graph = build_graph(model)
     compressed_model = compression_algorithm.apply(model, graph, dataset=dataset)
diff --git a/src/nncf/onnx/quantization/quantize_model.py b/src/nncf/onnx/quantization/quantize_model.py
index 4ec6b8c6111..fddaee53aa1 100644
--- a/src/nncf/onnx/quantization/quantize_model.py
+++ b/src/nncf/onnx/quantization/quantize_model.py
@@ -324,8 +324,9 @@ def compress_weights_impl(
     scale_estimation: bool,
     gptq: bool,
     lora_correction: bool,
-    backup_mode: BackupMode,
-    compression_format: CompressionFormat,
+    hqq: bool = False,
+    backup_mode: BackupMode = BackupMode.INT8_ASYM,
+    compression_format: CompressionFormat = CompressionFormat.DQ,
     advanced_parameters: AdvancedCompressionParameters | None = None,
 ) -> onnx.ModelProto:
     if model.opset_import[0].version < 13:
@@ -357,8 +358,9 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
-        compression_format,
-        advanced_parameters,
+        hqq=hqq,
+        compression_format=compression_format,
+        advanced_parameters=advanced_parameters,
     )
     graph = build_graph(model)
 
diff --git a/src/nncf/openvino/quantization/quantize_model.py b/src/nncf/openvino/quantization/quantize_model.py
index 19031fb1674..37f9d77601f 100644
--- a/src/nncf/openvino/quantization/quantize_model.py
+++ b/src/nncf/openvino/quantization/quantize_model.py
@@ -376,8 +376,9 @@ def compress_weights_impl(
     scale_estimation: bool,
     gptq: bool,
     lora_correction: bool,
-    backup_mode: BackupMode,
-    compression_format: CompressionFormat,
+    hqq: bool = False,
+    backup_mode: BackupMode = BackupMode.INT8_ASYM,
+    compression_format: CompressionFormat = CompressionFormat.DQ,
     advanced_parameters: AdvancedCompressionParameters | None = None,
 ) -> ov.Model:
     """
@@ -398,8 +399,9 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
-        compression_format,
-        advanced_parameters,
+        hqq=hqq,
+        compression_format=compression_format,
+        advanced_parameters=advanced_parameters,
     )
 
     statistics_points = None
diff --git a/src/nncf/quantization/advanced_parameters.py b/src/nncf/quantization/advanced_parameters.py
index df39661f164..71a4b925742 100644
--- a/src/nncf/quantization/advanced_parameters.py
+++ b/src/nncf/quantization/advanced_parameters.py
@@ -382,6 +382,21 @@ class AdvancedLoraCorrectionParameters:
     use_int8_adapters: bool = True
 
 
+@api()
+@dataclass
+class AdvancedHQQParameters:
+    """
+    Contains advanced parameters for the HQQ (Half-Quadratic Quantization) algorithm.
+
+    :param num_iterations: Number of alternating least-squares iterations used to refine
+        scale and zero point (a fixed count, no convergence check). More iterations may improve
+        quality at the cost of runtime. Defaults to 20.
+    :type num_iterations: int
+    """
+
+    num_iterations: int = 20
+
+
 @api()
 @dataclass
 class AdvancedAdaptiveCodebookParameters:
@@ -423,6 +438,8 @@ class AdvancedCompressionParameters:
     :type scale_estimation_params: AdvancedScaleEstimationParameters
     :param gptq_params: Advanced parameters for GPTQ algorithm.
     :type gptq_params: AdvancedGPTQParameters
+    :param hqq_params: Advanced parameters for HQQ algorithm.
+    :type hqq_params: AdvancedHQQParameters
     :param lora_correction_params: Advanced parameters for Lora Correction algorithm.
     :type lora_correction_params: AdvancedLoraCorrectionParameters
     :param backend_params: Backend-specific parameters.
@@ -443,6 +460,7 @@ class AdvancedCompressionParameters:
         default_factory=AdvancedScaleEstimationParameters
     )
     gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
+    hqq_params: AdvancedHQQParameters = field(default_factory=AdvancedHQQParameters)
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
     backend_params: dict[str, Any] = field(default_factory=dict)
     codebook: TTensor | None = None
diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py
index af9c4c2661a..4166f1e16aa 100644
--- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py
+++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -45,6 +45,7 @@
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
 from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES
 from nncf.quantization.algorithms.weight_compression.gptq import GPTQ
+from nncf.quantization.algorithms.weight_compression.hqq import HQQ
 from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
 from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
 from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation
@@ -81,6 +82,7 @@ def get_weight_compression_configuration(
     scale_estimation: bool | None = None,
     gptq: bool | None = None,
     lora_correction: bool | None = None,
+    hqq: bool | None = None,
     ignored_scope: IgnoredScope | None = None,
     sensitivity_metric: SensitivityMetric | None = None,
     backup_mode: BackupMode | None = None,
@@ -120,6 +122,7 @@ def get_weight_compression_configuration(
         "scale_estimation": scale_estimation or False,
         "gptq": gptq or False,
         "lora_correction": lora_correction or False,
+        "hqq": hqq or False,
         "ignored_scope": ignored_scope or IgnoredScope(),
         "sensitivity_metric": (
             (
@@ -146,6 +149,7 @@ def check_user_compression_configuration(
     scale_estimation: bool | None,
     gptq: bool | None,
     lora_correction: bool | None,
+    hqq: bool | None,
     ignored_scope: IgnoredScope | None,
     sensitivity_metric: SensitivityMetric | None,
     backup_mode: BackupMode | None,
@@ -175,6 +179,7 @@ def check_user_compression_configuration(
             "scale_estimation": scale_estimation,
             "gptq": gptq,
             "lora_correction": lora_correction,
+            "hqq": hqq,
             "backup_mode": backup_mode,
         }
         unsupported_for_int8 = [name for name, value in unsupported_options.items() if value is not None]
@@ -257,6 +262,10 @@ def check_user_compression_configuration(
             requires a dataset, but it's not provided."
         raise nncf.ValidationError(msg)
 
+    if hqq and gptq:
+        msg = "Simultaneous use of HQQ and GPTQ algorithms is not supported. Select one of them."
+        raise nncf.ParameterNotSupportedError(msg)
+
     if lora_correction and compression_format in [
         CompressionFormat.FQ,
         CompressionFormat.FQ_LORA,
@@ -311,6 +320,7 @@ def __init__(
         gptq: bool,
         lora_correction: bool,
         backup_mode: BackupMode,
+        hqq: bool = False,
         compression_format: CompressionFormat = CompressionFormat.DQ,
         advanced_parameters: AdvancedCompressionParameters | None = None,
     ):
@@ -355,6 +365,10 @@ def __init__(
             INT8_ASYM stands for 8-bit integer asymmetric quantization with a typical non-fixed zero point.
             MXFP8_E4M3 stands for MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale.
             FP8_E4M3 stands for FP8 format with E4M3 values sharing group-level fp16 scale.
+        :param hqq: determines whether to use the HQQ (Half-Quadratic Quantization) algorithm.
+            HQQ is a data-free method that optimizes scale and zero-point jointly via alternating
+            least-squares, typically producing lower quantization error than standard min-max
+            initialization, especially for 4-bit group-wise compression.
         :param compression_format: Describes the format in which the model is saved after weight compression.
         :param advanced_parameters: advanced parameters for algorithms in compression pipeline.
         """
@@ -376,6 +390,7 @@ def __init__(
         self._codebook_estimation = mode == CompressWeightsMode.ADAPTIVE_CODEBOOK
         self._backup_mode = backup_mode
         self._compression_format = compression_format
+        self._hqq = hqq
         self._advanced_parameters = (
             advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
         )
@@ -405,6 +420,9 @@ def __init__(
                 subset_size=gptq_params.subset_size,
                 scale_estimation=self._scale_estimation,
             )
+        if self._hqq:
+            hqq_params = self._advanced_parameters.hqq_params
+            self._hqq_algo = HQQ(num_iterations=hqq_params.num_iterations)
         if self._scale_estimation:
             scale_estimation_params = self._advanced_parameters.scale_estimation_params
             self._scale_estimation_algo = ScaleEstimation(
@@ -1163,6 +1181,14 @@ def apply_with_parameters(
                 backend_entity=self._backend_entity,
             )
         else:
+            if self._hqq:
+                precomputed_compressed_weights = self._hqq_algo.apply(
+                    model=model,
+                    graph=graph,
+                    all_weight_params=all_weight_params,
+                    backend_entity=self._backend_entity,
+                )
+
             if self._scale_estimation:
                 precomputed_compressed_weights = self._scale_estimation_algo.apply(
                     model=model,
@@ -1211,6 +1237,7 @@ def apply_with_parameters(
                 "scale_estimation": self._scale_estimation,
                 "gptq": self._gptq,
                 "lora_correction": self._lora_correction,
+                "hqq": self._hqq,
                 "backup_mode": self._backup_mode.value,
                 "compression_format": self._compression_format.value,
                 "advanced_parameters": convert_to_dict_recursively(self._advanced_parameters),
diff --git a/src/nncf/quantization/algorithms/weight_compression/hqq.py b/src/nncf/quantization/algorithms/weight_compression/hqq.py
new file mode 100644
index 00000000000..fe47cfda306
--- /dev/null
+++ b/src/nncf/quantization/algorithms/weight_compression/hqq.py
@@ -0,0 +1,240 @@
+# Copyright (c) 2026 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TypeVar
+
+import nncf
+from nncf.common.graph.graph import NNCFGraph
+from nncf.common.logging.track_progress import track
+from nncf.common.utils.backend import BackendType
+from nncf.common.utils.backend import get_backend
+from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight
+from nncf.quantization.algorithms.weight_compression.weight_lowering import ReductionAxes
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params
+from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
+from nncf.tensor import Tensor
+from nncf.tensor import functions as fns
+from nncf.tensor.definitions import TensorBackend
+from nncf.tensor.definitions import TensorDataType
+
+TModel = TypeVar("TModel")
+
+
+class HQQ:
+    """
+    Half-Quadratic Quantization (HQQ) algorithm implementation.
+
+    HQQ is a data-free weight quantization algorithm that minimizes quantization error
+    without requiring calibration data. It uses alternating least-squares optimization
+    to jointly find optimal scale and zero-point parameters, producing floating-point
+    zero points for asymmetric quantization.
+
+    Reference: "Half-Quadratic Quantization of Large Machine Learning Models"
+    (https://mobiusml.github.io/hqq_blog/)
+    """
+
+    def __init__(self, num_iterations: int = 20):
+        """
+        :param num_iterations: Number of alternating optimization iterations.
+            More iterations improve quantization quality at the cost of compute time.
+            Defaults to 20.
+        """
+        self._num_iterations = num_iterations
+        self._backend_entity = None
+
+    @property
+    def available_backends(self) -> list[BackendType]:
+        return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX, BackendType.ONNX]
+
+    def _set_backend_entity(self, model: TModel) -> None:
+        """
+        Creates a helper class with a backend-specific logic of the algorithm.
+
+        :param model: Backend-specific input model.
+        """
+        model_backend = get_backend(model)
+        if model_backend == BackendType.OPENVINO:
+            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+
+            self._backend_entity = OVWeightCompressionAlgoBackend(model)
+        elif model_backend == BackendType.TORCH:
+            from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend
+
+            self._backend_entity = PTWeightCompressionAlgoBackend()
+        elif model_backend == BackendType.TORCH_FX:
+            from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend
+
+            self._backend_entity = FXWeightCompressionAlgoBackend()
+        elif model_backend == BackendType.ONNX:
+            from nncf.quantization.algorithms.weight_compression.onnx_backend import ONNXWeightCompressionAlgoBackend
+
+            self._backend_entity = ONNXWeightCompressionAlgoBackend(model)
+        else:
+            msg = (
+                "Cannot return backend-specific HQQ entity because"
+                f" {model_backend.value} is not supported!"
+            )
+            raise nncf.UnsupportedBackendError(msg)
+
+    def apply(
+        self,
+        model: TModel,
+        graph: NNCFGraph,
+        all_weight_params: list[WeightCompressionParameters],
+        backend_entity: WeightCompressionAlgoBackend | None = None,
+    ) -> dict[str, CompressedWeight]:
+        """
+        Applies the HQQ algorithm to compute optimized scale and zero-point parameters.
+
+        For each eligible weight, HQQ alternately:
+          1. Quantizes the weight with the current (scale, zero_point), and
+          2. Updates (scale, zero_point) via a closed-form least-squares step.
+
+        The resulting CompressedWeight objects contain None for the compressed tensor
+        (quantization is deferred) but carry the HQQ-optimized float scale and, for
+        asymmetric modes, a float-valued zero point.
+
+        :param model: Model for applying algorithm.
+        :param graph: Model graph.
+        :param all_weight_params: List of all weight parameters.
+        :param backend_entity: Weight compression algorithm backend.
+        :return: A dictionary mapping weight names to CompressedWeight instances with
+            HQQ-optimized scale and zero point.
+        """
+        self._backend_entity = backend_entity
+        if self._backend_entity is None:
+            self._set_backend_entity(model)
+
+        res = {}
+
+        for wp in track(all_weight_params, description="Applying HQQ"):
+            weight_name = wp.weight_name
+            config = wp.compression_config
+
+            if not config.is_integer:
+                res[weight_name] = CompressedWeight()
+                continue
+
+            weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph)
+            if len(weight_data) != 1:  # not supported by the algorithm
+                continue
+            _, weight_port_id = weight_data[0]
+
+            weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)
+
+            # Convert to numpy for stable in-loop arithmetic, avoiding the OV-optimized
+            # quantization path, which may not handle float zero points.
+            if weight.backend == TensorBackend.ov:
+                weight = weight.as_numpy_tensor()
+            weight = fns.astype(weight, TensorDataType.float32)
+
+            scale, zero_point = self._calculate_hqq_params(weight, config, wp.reduction_axes)
+            res[weight_name] = CompressedWeight(None, scale, zero_point, None)
+
+        return res
+
+    def _calculate_hqq_params(
+        self,
+        weight: Tensor,
+        config: WeightCompressionConfig,
+        reduction_axes: ReductionAxes,
+    ) -> tuple[Tensor, Tensor | None]:
+        """
+        Computes HQQ-optimized scale and zero point for integer quantization.
+
+        The algorithm alternates between two steps for a fixed ``num_iterations`` rounds:
+          - Quantization step: Q = clamp(round(W / s + z), q_min, q_max)
+          - Parameter update:
+            * Asymmetric (W ≈ s * (Q - z)): joint closed-form least-squares for s and z.
+            * Symmetric (W ≈ s * Q): closed-form update for s alone.
+
+        For the asymmetric case the zero point z is float-valued (not rounded to an integer),
+        which gives HQQ better reconstruction quality than standard min-max initialization.
+
+        :param weight: Weight tensor in float32.
+        :param config: Weight compression configuration.
+        :param reduction_axes: Reduction axes for the weight tensor.
+        :return: Tuple of (scale, zero_point). zero_point is float for asymmetric mode,
+            None for symmetric mode.
+        """
+        group_size = config.group_size
+        group_reduction_axes = reduction_axes
+
+        # Reshape weights for grouped quantization when a group size is specified.
+        if group_size != -1:
+            weight, group_reduction_axes = reshape_weight_for_grouped_quantization(
+                weight, reduction_axes, group_size
+            )
+
+        # Number of elements along the reduction axis (i.e. per group).
+        if isinstance(group_reduction_axes, int):
+            n = weight.shape[group_reduction_axes]
+        else:
+            n = 1
+            for ax in group_reduction_axes:
+                n *= weight.shape[ax]
+
+        num_bits = config.num_bits
+        is_asym = config.is_asym_mode
+        level_low = 0 if is_asym else -(2 ** (num_bits - 1))
+        level_high = 2**num_bits - 1 if is_asym else 2 ** (num_bits - 1) - 1
+
+        eps = fns.finfo(weight).eps
+
+        # Initialize with standard min-max quantization parameters.
+        scale, zero_point = calculate_integer_quantization_params(weight, group_reduction_axes, config)
+
+        # Cast integer zero point to float32 so arithmetic below is uniform.
+        if zero_point is not None:
+            zero_point = fns.astype(zero_point, TensorDataType.float32)
+
+        sum_w = fns.sum(weight, axis=group_reduction_axes, keepdims=True)  # loop-invariant
+
+        for _ in range(self._num_iterations):
+            # Quantization step: Q = clamp(round(W / s + z), q_min, q_max)
+            q_float = weight / scale
+            if zero_point is not None:
+                q_float = q_float + zero_point
+            q_float = fns.clip(fns.round(q_float), level_low, level_high)
+
+            if is_asym:
+                # Asymmetric least-squares update for (s, z): minimize ||W - s*(Q - z)||^2.
+                # Letting b = s*z, normal equations give:
+                #   det = n * sum_QQ - sum_Q^2
+                #   s   = (n * sum_QW - sum_Q * sum_W) / det
+                #   z   = (sum_Q * sum_QW - sum_QQ * sum_W) / (det * s)
+                sum_q = fns.sum(q_float, axis=group_reduction_axes, keepdims=True)
+                sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True)
+                sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True)
+
+                # det == 0: every element of a group maps to the same level and the system is singular.
+                # Such groups keep their previous (scale, zero_point) instead of dividing by eps.
+                det = n * sum_qq - sum_q * sum_q
+                degenerate = fns.abs(det) < eps
+                safe_det = fns.where(degenerate, 1.0, det)
+                new_scale = (n * sum_qw - sum_q * sum_w) / safe_det
+                new_scale = fns.where(fns.abs(new_scale) < eps, eps, new_scale)
+                new_zero_point = (sum_q * sum_qw - sum_qq * sum_w) / (safe_det * new_scale)
+                scale = fns.where(degenerate, scale, new_scale)
+                zero_point = fns.where(degenerate, zero_point, new_zero_point)
+            else:
+                # Symmetric least-squares update for s: minimize ||W - s*Q||^2.
+                #   s = sum(W*Q) / sum(Q^2)
+                sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True)
+                sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True)
+                denom = fns.where(fns.abs(sum_qq) < eps, eps, sum_qq)
+                scale = sum_qw / denom
+                scale = fns.where(fns.abs(scale) < eps, eps, scale)
+
+        return scale, zero_point
diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py
index ffd1dacc833..b030283643e 100644
--- a/src/nncf/quantization/quantize_model.py
+++ b/src/nncf/quantization/quantize_model.py
@@ -422,6 +422,7 @@ def compress_weights(
     scale_estimation: bool | None = None,
     gptq: bool | None = None,
     lora_correction: bool | None = None,
+    hqq: bool | None = None,
     backup_mode: BackupMode | None = None,
     compression_format: CompressionFormat = CompressionFormat.DQ,
     advanced_parameters: AdvancedCompressionParameters | None = None,
@@ -478,6 +479,11 @@ def compress_weights(
     :type gptq: bool
     :param lora_correction: Indicates whether to use Lora Correction algorithm.
     :type lora_correction: bool
+    :param hqq: Indicates whether to use the HQQ (Half-Quadratic Quantization) algorithm.
+        HQQ is a data-free method that optimizes scale and zero-point via alternating least-squares,
+        producing lower quantization error than standard min-max initialization.
+        Currently supported for OpenVINO backend only.
+    :type hqq: bool
     :param backup_mode: Defines a backup mode for mixed-precision weight compression.
         NONE stands for original floating-point precision of the model weights.
             In this mode, weights are retained in their original precision without any quantization.
@@ -522,7 +528,7 @@ def compress_weights(
             )
             raise nncf.ParameterNotSupportedError(msg)
 
-        options = {"gptq": gptq, "lora_correction": lora_correction}
+        options = {"gptq": gptq, "lora_correction": lora_correction, "hqq": hqq}
         unsupported_options = [name for name, value in options.items() if value is not None]
         if unsupported_options:
             msg = f"Torch backend does not support {', '.join(unsupported_options)} option(s). Set them to None."
@@ -571,7 +577,7 @@ def compress_weights(
             )
             raise nncf.ParameterNotSupportedError(msg)
 
-        options = {"gptq": gptq, "lora_correction": lora_correction}
+        options = {"gptq": gptq, "lora_correction": lora_correction, "hqq": hqq}
         unsupported_options = [name for name, value in options.items() if value is not None]
         if unsupported_options:
             msg = f"TorchFX backend does not support {', '.join(unsupported_options)} option(s). Set them to None."
@@ -644,7 +650,7 @@ def compress_weights(
             )
             raise nncf.ParameterNotSupportedError(msg)
 
-        options = {"gptq": gptq, "lora_correction": lora_correction}
+        options = {"gptq": gptq, "lora_correction": lora_correction, "hqq": hqq}
         unsupported_options = [name for name, value in options.items() if value is not None]
         if unsupported_options:
             msg = f"ONNX backend does not support {', '.join(unsupported_options)} option(s). Set them to None."
@@ -669,6 +675,7 @@ def compress_weights(
         scale_estimation,
         gptq,
         lora_correction,
+        hqq,
         ignored_scope,
         sensitivity_metric,
         backup_mode,
@@ -685,6 +692,7 @@ def compress_weights(
         scale_estimation,
         gptq,
         lora_correction,
+        hqq,
         ignored_scope,
         sensitivity_metric,
         backup_mode,
diff --git a/src/nncf/torch/quantization/quantize_model.py b/src/nncf/torch/quantization/quantize_model.py
index 306286bdd18..e9eccfbbed7 100644
--- a/src/nncf/torch/quantization/quantize_model.py
+++ b/src/nncf/torch/quantization/quantize_model.py
@@ -99,8 +99,9 @@ def compress_weights_impl(
     scale_estimation: bool,
     gptq: bool,
     lora_correction: bool,
-    backup_mode: BackupMode,
-    compression_format: CompressionFormat,
+    hqq: bool = False,
+    backup_mode: BackupMode = BackupMode.INT8_ASYM,
+    compression_format: CompressionFormat = CompressionFormat.DQ,
     advanced_parameters: AdvancedCompressionParameters | None = None,
 ) -> torch.nn.Module:
     """
@@ -119,8 +120,9 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
-        compression_format,
-        advanced_parameters,
+        hqq=hqq,
+        compression_format=compression_format,
+        advanced_parameters=advanced_parameters,
     )
     graph = build_graph(model)
 
diff --git a/tests/openvino/native/quantization/test_hqq.py b/tests/openvino/native/quantization/test_hqq.py
new file mode 100644
index 00000000000..45c73889a79
--- /dev/null
+++ b/tests/openvino/native/quantization/test_hqq.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2026 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pytest
+
+import nncf
+from nncf.parameters import CompressWeightsMode
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.hqq import HQQ
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params
+from nncf.quantization.algorithms.weight_compression.weight_lowering import (
+    reshape_weight_for_grouped_quantization,
+)
+from nncf.tensor import Tensor
+from nncf.tensor import TensorDataType
+from nncf.tensor import functions as fns
+
+
+def _make_weight(shape, seed=42, scale=10.0):
+    """Create a deterministic float32 weight tensor."""
+    rng = np.random.default_rng(seed)
+    data = rng.standard_normal(shape).astype(np.float32) * scale
+    return Tensor(data)
+
+
+def _quantization_error(weight: Tensor, scale: Tensor, zero_point: Tensor | None, config: WeightCompressionConfig, reduction_axes) -> float:
+    """Compute mean squared quantization error: E[|W - s*(Q - z)|^2]."""
+    group_size = config.group_size
+    w = weight
+    reduction = reduction_axes
+
+    if group_size != -1:
+        w, reduction = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size)
+
+    q = w / scale
+    if zero_point is not None:
+        q = q + zero_point
+    q = fns.round(q)
+
+    num_bits = config.num_bits
+    is_asym = config.is_asym_mode
+    level_low = 0 if is_asym else -(2 ** (num_bits - 1))
+    level_high = 2**num_bits - 1 if is_asym else 2 ** (num_bits - 1) - 1
+    q = fns.clip(q, level_low, level_high)
+
+    if zero_point is not None:
+        reconstructed = scale * (q - zero_point)
+    else:
+        reconstructed = scale * q
+
+    diff = w - reconstructed
+    return float(fns.mean(diff * diff).data)
+
+
+@pytest.mark.parametrize("mode,group_size,reduction_axes", [
+    (CompressWeightsMode.INT4_ASYM, 16, 1),
+    (CompressWeightsMode.INT4_SYM, 16, 1),
+    (CompressWeightsMode.INT4_ASYM, -1, 1),
+])
+def test_hqq_reduces_quantization_error(mode, group_size, reduction_axes):
+    """HQQ-optimized params should produce a quantization error no greater than min-max init."""
+    weight = _make_weight((32, 64), seed=7)
+    config = WeightCompressionConfig(mode=mode, group_size=group_size)
+
+    hqq = HQQ(num_iterations=20)
+    hqq_scale, hqq_zp = hqq._calculate_hqq_params(weight, config, reduction_axes)
+
+    # Baseline: standard min-max initialization
+    w = weight
+    r = reduction_axes
+    if group_size != -1:
+        w, r = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size)
+    baseline_scale, baseline_zp = calculate_integer_quantization_params(w, r, config)
+
+    hqq_err = _quantization_error(weight, hqq_scale, hqq_zp, config, reduction_axes)
+    baseline_err = _quantization_error(weight, baseline_scale, baseline_zp, config, reduction_axes)
+
+    assert hqq_err <= baseline_err + 1e-6, (
+        f"HQQ error ({hqq_err:.6f}) should not exceed min-max error ({baseline_err:.6f}) "
+        f"for mode={mode.value}, group_size={group_size}"
+    )
+
+
+def test_hqq_asymmetric_float_zero_point():
+    """For asymmetric modes HQQ should return a float-valued (non-integer) zero point."""
+    weight = _make_weight((32, 64), seed=13, scale=5.0)
+    config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=16)
+
+    hqq = HQQ(num_iterations=20)
+    _, zero_point = hqq._calculate_hqq_params(weight, config, reduction_axes=1)
+
+    assert zero_point is not None, "Expected non-None zero_point for asymmetric mode"
+
+    zp_np = zero_point.data
+    # The zero point should be float32
+    assert zp_np.dtype == np.float32, f"Expected float32 zero point, got {zp_np.dtype}"
+    # At least some values should be non-integer (HQQ doesn't round to integers)
+    is_integer_valued = np.allclose(zp_np, np.round(zp_np), atol=1e-3)
+    assert not is_integer_valued, "HQQ zero point should be float-valued, not integer-valued"
+
+
+def test_hqq_symmetric_no_zero_point():
+    """For symmetric modes HQQ should return None as zero point."""
+    weight = _make_weight((32, 64), seed=17)
+    config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_SYM, group_size=16)
+
+    hqq = HQQ(num_iterations=20)
+    _, zero_point = hqq._calculate_hqq_params(weight, config, reduction_axes=1)
+
+    assert zero_point is None, "Expected None zero_point for symmetric mode"
+
+
+@pytest.mark.parametrize("num_iterations", [0, 1, 5, 20])
+def test_hqq_num_iterations_parameter(num_iterations):
+    """HQQ should be callable with various num_iterations values including zero."""
+    weight = _make_weight((16, 32), seed=3)
+    config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=16)
+
+    hqq = HQQ(num_iterations=num_iterations)
+    scale, zero_point = hqq._calculate_hqq_params(weight, config, reduction_axes=1)
+
+    assert scale is not None
+    assert zero_point is not None
+    assert scale.shape == zero_point.shape
+
+
+def test_hqq_advanced_parameters_exposed():
+    """AdvancedHQQParameters must be importable from the nncf public namespace."""
+    params = nncf.AdvancedHQQParameters(num_iterations=10)
+    assert params.num_iterations == 10
+
+
+def test_hqq_gptq_mutual_exclusion():
+    """Specifying both hqq=True and gptq=True should raise ParameterNotSupportedError."""
+    from nncf.quantization.algorithms.weight_compression.algorithm import check_user_compression_configuration
+
+    with pytest.raises(nncf.ParameterNotSupportedError, match="HQQ and GPTQ"):
+        check_user_compression_configuration(
+            mode=CompressWeightsMode.INT4_ASYM,
+            subset_size=128,
+            dataset=None,
+            ratio=1.0,
+            group_size=128,
+            all_layers=None,
+            awq=None,
+            scale_estimation=None,
+            gptq=True,
+            lora_correction=None,
+            hqq=True,
+            ignored_scope=None,
+            sensitivity_metric=None,
+            backup_mode=None,
+            compression_format=None,
+            advanced_parameters=None,
+        )
+
+
+def test_hqq_int8_unsupported():
+    """HQQ should not be accepted for INT8 modes."""
+    from nncf.quantization.algorithms.weight_compression.algorithm import check_user_compression_configuration
+
+    with pytest.raises(nncf.ParameterNotSupportedError, match="hqq"):
+        check_user_compression_configuration(
+            mode=CompressWeightsMode.INT8_ASYM,
+            subset_size=128,
+            dataset=None,
+            ratio=None,
+            group_size=None,
+            all_layers=None,
+            awq=None,
+            scale_estimation=None,
+            gptq=None,
+            lora_correction=None,
+            hqq=True,
+            ignored_scope=None,
+            sensitivity_metric=None,
+            backup_mode=None,
+            compression_format=None,
+            advanced_parameters=None,
+        )

From c40b11afee23f81826cfb0a2b0352fe0e10a35dc Mon Sep 17 00:00:00 2001
From: abhayuvi <abhayuvi.raj@gmail.com>
Date: Fri, 13 Mar 2026 16:13:09 +0530
Subject: [PATCH 2/2] fix: round HQQ zero_point to integer for consistent
 quant/dequant

---
 .../algorithms/weight_compression/hqq.py      | 81 +++++++++++--------
 .../openvino/native/quantization/test_hqq.py  | 17 ++--
 2 files changed, 59 insertions(+), 39 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/hqq.py b/src/nncf/quantization/algorithms/weight_compression/hqq.py
index fe47cfda306..d075d9ae713 100644
--- a/src/nncf/quantization/algorithms/weight_compression/hqq.py
+++ b/src/nncf/quantization/algorithms/weight_compression/hqq.py
@@ -37,8 +37,9 @@ class HQQ:
 
     HQQ is a data-free weight quantization algorithm that minimizes quantization error
     without requiring calibration data. It uses alternating least-squares optimization
-    to jointly find optimal scale and zero-point parameters, producing floating-point
-    zero points for asymmetric quantization.
+    to find optimal scale and zero-point parameters. For asymmetric quantization, HQQ
+    optimizes the zero-point as a continuous float during iterations, then rounds it to
+    the nearest integer before returning so that quantization and dequantization agree.
 
     Reference: "Half-Quadratic Quantization of Large Machine Learning Models"
     (https://mobiusml.github.io/hqq_blog/)
@@ -154,19 +155,23 @@ def _calculate_hqq_params(
         Computes HQQ-optimized scale and zero point for integer quantization.
 
         The algorithm alternates between two steps until convergence:
-          - Quantization step: Q = clamp(round(W / s + z), q_min, q_max)
+          - Quantization step: Q = clamp(round(W * inv_s + z), q_min, q_max)
           - Parameter update:
-            * Asymmetric (W ≈ s * (Q - z)): joint closed-form least-squares for s and z.
-            * Symmetric (W ≈ s * Q): closed-form update for s alone.
+            * Asymmetric: scale is fixed (min-max init); only z is updated via
+              closed-form `z = mean(Q - W * inv_s)` (per the paper).
+            * Symmetric: z = None; scale is updated via `s = sum(W*Q) / sum(Q²)`.
 
-        For the asymmetric case the zero point z is float-valued (not rounded to an integer),
-        which gives HQQ better reconstruction quality than standard min-max initialization.
+        For the asymmetric case the zero point z is optimized as a continuous float during
+        iterations (giving better reconstruction than integer-only search), then rounded and
+        clipped to the valid integer range before being returned. This ensures consistency
+        between quantization (which uses the returned z) and dequantization (which loads z
+        as a stored integer, e.g. uint4).
 
         :param weight: Weight tensor in float32.
         :param config: Weight compression configuration.
         :param reduction_axes: Reduction axes for the weight tensor.
-        :return: Tuple of (scale, zero_point). zero_point is float for asymmetric mode,
-            None for symmetric mode.
+        :return: Tuple of (scale, zero_point). zero_point is an integer-valued float32
+            tensor for asymmetric mode, None for symmetric mode.
         """
         group_size = config.group_size
         group_reduction_axes = reduction_axes
@@ -199,42 +204,52 @@ def _calculate_hqq_params(
         if zero_point is not None:
             zero_point = fns.astype(zero_point, TensorDataType.float32)
 
+        # Pre-compute inv_scale once; scale is fixed for asymmetric iterations.
+        inv_scale = 1.0 / fns.where(fns.abs(scale) < eps, eps, scale)
+
         for _ in range(self._num_iterations):
-            # Quantization step: Q = clamp(round(W / s + z), q_min, q_max)
-            q_float = weight / scale
+            # Quantization step: Q = clamp(round(W * inv_s + z), q_min, q_max)
+            q_float = weight * inv_scale
             if zero_point is not None:
                 q_float = q_float + zero_point
             q_float = fns.round(q_float)
             q_float = fns.clip(q_float, level_low, level_high)
 
             if is_asym:
-                # Asymmetric least-squares update for (s, z): minimize ||W - s*(Q - z)||^2.
-                # Letting b = s*z, normal equations give:
-                #   det = n * sum_QQ - sum_Q^2
-                #   s   = (n * sum_QW - sum_Q * sum_W) / det
-                #   z   = (sum_Q * sum_QW - sum_QQ * sum_W) / (det * s)
-                sum_q = fns.sum(q_float, axis=group_reduction_axes, keepdims=True)
-                sum_w = fns.sum(weight, axis=group_reduction_axes, keepdims=True)
-                sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True)
-                sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True)
-
-                det = n * sum_qq - sum_q * sum_q
-                safe_det = fns.where(fns.abs(det) < eps, eps, det)
-
-                new_scale = (n * sum_qw - sum_q * sum_w) / safe_det
-                new_scale = fns.where(fns.abs(new_scale) < eps, eps, new_scale)
-                new_zero_point = (sum_q * sum_qw - sum_qq * sum_w) / (safe_det * new_scale)
-
-                scale = new_scale
-                zero_point = new_zero_point
-
+                # Asymmetric: fix scale, update zero_point only (per the paper).
+                # Minimizing ||W - s*(Q - z)||² w.r.t. z gives:
+                #   z = mean(Q - W/s) = sum(Q - W*inv_s) / n
+                zero_point = fns.sum(q_float - weight * inv_scale, axis=group_reduction_axes, keepdims=True)
+                zero_point = zero_point / n
             else:
-                # Symmetric least-squares update for s: minimize ||W - s*Q||^2.
-                #   s = sum(W*Q) / sum(Q^2)
+                # Symmetric OLS update for scale: minimize ||W - s*Q||².
+                #   s = sum(W*Q) / sum(Q²)
                 sum_qw = fns.sum(q_float * weight, axis=group_reduction_axes, keepdims=True)
                 sum_qq = fns.sum(q_float * q_float, axis=group_reduction_axes, keepdims=True)
                 denom = fns.where(fns.abs(sum_qq) < eps, eps, sum_qq)
                 scale = sum_qw / denom
                 scale = fns.where(fns.abs(scale) < eps, eps, scale)
+                inv_scale = 1.0 / scale
+
+        # Round and clip zero_point to the valid integer range so that quantization
+        # and dequantization (which stores zp as an integer, e.g. uint4) use the exact same value.
+        if zero_point is not None:
+            zero_point = self._round_zero_point(zero_point, level_low, level_high)
 
         return scale, zero_point
+
+    @staticmethod
+    def _round_zero_point(zero_point: Tensor, level_low: int, level_high: int) -> Tensor:
+        """
+        Rounds the float zero_point to the nearest integer and clips it to the valid quantization range.
+
+        HQQ optimizes z as a continuous value during iterations, but the OV backend stores
+        zero_point as integer (uint4 for INT4). To ensure that quantization and dequantization
+        use the same z, the final float z is rounded and clipped before returning.
+
+        :param zero_point: Float zero_point tensor from HQQ iterations.
+        :param level_low: Minimum valid zero_point value.
+        :param level_high: Maximum valid zero_point value.
+        :return: Rounded and clipped zero_point.
+        """
+        return fns.clip(fns.round(zero_point), level_low, level_high)
diff --git a/tests/openvino/native/quantization/test_hqq.py b/tests/openvino/native/quantization/test_hqq.py
index 45c73889a79..97688e29edf 100644
--- a/tests/openvino/native/quantization/test_hqq.py
+++ b/tests/openvino/native/quantization/test_hqq.py
@@ -90,8 +90,13 @@ def test_hqq_reduces_quantization_error(mode, group_size, reduction_axes):
     )
 
 
-def test_hqq_asymmetric_float_zero_point():
-    """For asymmetric modes HQQ should return a float-valued (non-integer) zero point."""
+def test_hqq_asymmetric_zero_point_rounded():
+    """HQQ should return an integer-valued zero point for use with uint4 storage.
+
+    HQQ optimizes z as a continuous float during iterations, but the final value is
+    rounded and clipped so that quantization and dequantization use the same integer z.
+    The tensor dtype stays float32 (no cast to an integer dtype), but all values should be integer-valued.
+    """
     weight = _make_weight((32, 64), seed=13, scale=5.0)
     config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=16)
 
@@ -101,11 +106,11 @@ def test_hqq_asymmetric_float_zero_point():
     assert zero_point is not None, "Expected non-None zero_point for asymmetric mode"
 
     zp_np = zero_point.data
-    # The zero point should be float32
+    # dtype remains float32 (no cast to an integer dtype); values are integer-valued after rounding.
     assert zp_np.dtype == np.float32, f"Expected float32 zero point, got {zp_np.dtype}"
-    # At least some values should be non-integer (HQQ doesn't round to integers)
-    is_integer_valued = np.allclose(zp_np, np.round(zp_np), atol=1e-3)
-    assert not is_integer_valued, "HQQ zero point should be float-valued, not integer-valued"
+    assert np.allclose(zp_np, np.round(zp_np), atol=1e-5), (
+        "HQQ zero point should be integer-valued after rounding for consistent uint4 storage"
+    )
 
 
 def test_hqq_symmetric_no_zero_point():