Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .ci/cspell_dict.txt
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ hellaswag
hiddens
hparam
hparams
hqq
hswish
huggingface
hutter
Expand Down
1 change: 1 addition & 0 deletions src/nncf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters as AdvancedBiasCorrectionParameters
from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as AdvancedCompressionParameters
from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as AdvancedGPTQParameters
from nncf.quantization.advanced_parameters import AdvancedHQQParameters as AdvancedHQQParameters
from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as AdvancedLoraCorrectionParameters
from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters as AdvancedQuantizationParameters
from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters as AdvancedScaleEstimationParameters
Expand Down
10 changes: 6 additions & 4 deletions src/nncf/experimental/torch/fx/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,9 @@ def compress_weights_impl(
scale_estimation: bool,
gptq: bool,
lora_correction: bool,
backup_mode: BackupMode,
compression_format: CompressionFormat,
hqq: bool = False,
backup_mode: BackupMode = BackupMode.INT8_ASYM,
compression_format: CompressionFormat = CompressionFormat.DQ,
advanced_parameters: AdvancedCompressionParameters | None = None,
) -> torch.fx.GraphModule:
"""
Expand All @@ -149,8 +150,9 @@ def compress_weights_impl(
gptq,
lora_correction,
backup_mode,
compression_format,
advanced_parameters,
hqq=hqq,
compression_format=compression_format,
advanced_parameters=advanced_parameters,
)
graph = build_graph(model)
compressed_model = compression_algorithm.apply(model, graph, dataset=dataset)
Expand Down
10 changes: 6 additions & 4 deletions src/nncf/onnx/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,9 @@ def compress_weights_impl(
scale_estimation: bool,
gptq: bool,
lora_correction: bool,
backup_mode: BackupMode,
compression_format: CompressionFormat,
hqq: bool = False,
backup_mode: BackupMode = BackupMode.INT8_ASYM,
compression_format: CompressionFormat = CompressionFormat.DQ,
advanced_parameters: AdvancedCompressionParameters | None = None,
) -> onnx.ModelProto:
if model.opset_import[0].version < 13:
Expand Down Expand Up @@ -357,8 +358,9 @@ def compress_weights_impl(
gptq,
lora_correction,
backup_mode,
compression_format,
advanced_parameters,
hqq=hqq,
compression_format=compression_format,
advanced_parameters=advanced_parameters,
)
graph = build_graph(model)

Expand Down
10 changes: 6 additions & 4 deletions src/nncf/openvino/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,8 +376,9 @@ def compress_weights_impl(
scale_estimation: bool,
gptq: bool,
lora_correction: bool,
backup_mode: BackupMode,
compression_format: CompressionFormat,
hqq: bool = False,
backup_mode: BackupMode = BackupMode.INT8_ASYM,
compression_format: CompressionFormat = CompressionFormat.DQ,
advanced_parameters: AdvancedCompressionParameters | None = None,
) -> ov.Model:
"""
Expand All @@ -398,8 +399,9 @@ def compress_weights_impl(
gptq,
lora_correction,
backup_mode,
compression_format,
advanced_parameters,
hqq=hqq,
compression_format=compression_format,
advanced_parameters=advanced_parameters,
)

statistics_points = None
Expand Down
18 changes: 18 additions & 0 deletions src/nncf/quantization/advanced_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,21 @@ class AdvancedLoraCorrectionParameters:
use_int8_adapters: bool = True


@api()
@dataclass
class AdvancedHQQParameters:
    """
    Advanced settings for the HQQ (Half-Quadratic Quantization) algorithm.

    :param num_iterations: How many alternating optimization iterations are run to
        jointly refine the scale and the zero point. A larger value improves quality
        but increases runtime. Defaults to 20.
    :type num_iterations: int
    """

    num_iterations: int = 20


@api()
@dataclass
class AdvancedAdaptiveCodebookParameters:
Expand Down Expand Up @@ -423,6 +438,8 @@ class AdvancedCompressionParameters:
:type scale_estimation_params: AdvancedScaleEstimationParameters
:param gptq_params: Advanced parameters for GPTQ algorithm.
:type gptq_params: AdvancedGPTQParameters
:param hqq_params: Advanced parameters for HQQ algorithm.
:type hqq_params: AdvancedHQQParameters
:param lora_correction_params: Advanced parameters for Lora Correction algorithm.
:type lora_correction_params: AdvancedLoraCorrectionParameters
:param backend_params: Backend-specific parameters.
Expand All @@ -443,6 +460,7 @@ class AdvancedCompressionParameters:
default_factory=AdvancedScaleEstimationParameters
)
gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
hqq_params: AdvancedHQQParameters = field(default_factory=AdvancedHQQParameters)
lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
backend_params: dict[str, Any] = field(default_factory=dict)
codebook: TTensor | None = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES
from nncf.quantization.algorithms.weight_compression.gptq import GPTQ
from nncf.quantization.algorithms.weight_compression.hqq import HQQ
from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation
Expand Down Expand Up @@ -81,6 +82,7 @@ def get_weight_compression_configuration(
scale_estimation: bool | None = None,
gptq: bool | None = None,
lora_correction: bool | None = None,
hqq: bool | None = None,
ignored_scope: IgnoredScope | None = None,
sensitivity_metric: SensitivityMetric | None = None,
backup_mode: BackupMode | None = None,
Expand Down Expand Up @@ -120,6 +122,7 @@ def get_weight_compression_configuration(
"scale_estimation": scale_estimation or False,
"gptq": gptq or False,
"lora_correction": lora_correction or False,
"hqq": hqq or False,
"ignored_scope": ignored_scope or IgnoredScope(),
"sensitivity_metric": (
(
Expand All @@ -146,6 +149,7 @@ def check_user_compression_configuration(
scale_estimation: bool | None,
gptq: bool | None,
lora_correction: bool | None,
hqq: bool | None,
ignored_scope: IgnoredScope | None,
sensitivity_metric: SensitivityMetric | None,
backup_mode: BackupMode | None,
Expand Down Expand Up @@ -175,6 +179,7 @@ def check_user_compression_configuration(
"scale_estimation": scale_estimation,
"gptq": gptq,
"lora_correction": lora_correction,
"hqq": hqq,
"backup_mode": backup_mode,
}
unsupported_for_int8 = [name for name, value in unsupported_options.items() if value is not None]
Expand Down Expand Up @@ -257,6 +262,10 @@ def check_user_compression_configuration(
requires a dataset, but it's not provided."
raise nncf.ValidationError(msg)

if hqq and gptq:
msg = "Simultaneous use of HQQ and GPTQ algorithms is not supported. Select one of them."
raise nncf.ParameterNotSupportedError(msg)

if lora_correction and compression_format in [
CompressionFormat.FQ,
CompressionFormat.FQ_LORA,
Expand Down Expand Up @@ -311,6 +320,7 @@ def __init__(
gptq: bool,
lora_correction: bool,
backup_mode: BackupMode,
hqq: bool = False,
compression_format: CompressionFormat = CompressionFormat.DQ,
advanced_parameters: AdvancedCompressionParameters | None = None,
):
Expand Down Expand Up @@ -355,6 +365,10 @@ def __init__(
INT8_ASYM stands for 8-bit integer asymmetric quantization with a typical non-fixed zero point.
MXFP8_E4M3 stands for MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale.
FP8_E4M3 stands for FP8 format with E4M3 values sharing group-level fp16 scale.
:param hqq: determines whether to use the HQQ (Half-Quadratic Quantization) algorithm.
HQQ is a data-free method that optimizes scale and zero-point jointly via alternating
least-squares, typically producing lower quantization error than standard min-max
initialization, especially for 4-bit group-wise compression.
:param compression_format: Describes the format in which the model is saved after weight compression.
:param advanced_parameters: advanced parameters for algorithms in compression pipeline.
"""
Expand All @@ -376,6 +390,7 @@ def __init__(
self._codebook_estimation = mode == CompressWeightsMode.ADAPTIVE_CODEBOOK
self._backup_mode = backup_mode
self._compression_format = compression_format
self._hqq = hqq
self._advanced_parameters = (
advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
)
Expand Down Expand Up @@ -405,6 +420,9 @@ def __init__(
subset_size=gptq_params.subset_size,
scale_estimation=self._scale_estimation,
)
if self._hqq:
hqq_params = self._advanced_parameters.hqq_params
self._hqq_algo = HQQ(num_iterations=hqq_params.num_iterations)
if self._scale_estimation:
scale_estimation_params = self._advanced_parameters.scale_estimation_params
self._scale_estimation_algo = ScaleEstimation(
Expand Down Expand Up @@ -1163,6 +1181,14 @@ def apply_with_parameters(
backend_entity=self._backend_entity,
)
else:
if self._hqq:
precomputed_compressed_weights = self._hqq_algo.apply(
model=model,
graph=graph,
all_weight_params=all_weight_params,
backend_entity=self._backend_entity,
)

if self._scale_estimation:
precomputed_compressed_weights = self._scale_estimation_algo.apply(
model=model,
Expand Down Expand Up @@ -1211,6 +1237,7 @@ def apply_with_parameters(
"scale_estimation": self._scale_estimation,
"gptq": self._gptq,
"lora_correction": self._lora_correction,
"hqq": self._hqq,
"backup_mode": self._backup_mode.value,
"compression_format": self._compression_format.value,
"advanced_parameters": convert_to_dict_recursively(self._advanced_parameters),
Expand Down
Loading