Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/nncf/onnx/graph/model_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,13 +284,17 @@ def _get_scale_zero_point_tensors(
dims = scale.shape if per_channel else []
onnx_scale = [scale.tolist()] if not per_channel else scale
onnx_zero_point = [zero_point.tolist()] if not per_channel else zero_point

if tensor_type == np.uint8:
onnx_tensor_type = onnx.TensorProto.UINT8
elif tensor_type == np.int8:
onnx_tensor_type = onnx.TensorProto.INT8
elif tensor_type in (onnx.TensorProto.FLOAT8E5M2, onnx.TensorProto.FLOAT8E4M3FN):
onnx_tensor_type = tensor_type
else:
msg = f"Incorrect tensor type - {tensor_type}."
raise nncf.ValidationError(msg)

assert quantizer.input[1] == dequantizer.input[1] and quantizer.input[2] == dequantizer.input[2]
scale_tensor_name = quantizer.input[1]
zero_point_tensor_name = quantizer.input[2]
Expand Down
12 changes: 7 additions & 5 deletions src/nncf/onnx/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,14 @@ def quantize_impl(
if target_device == TargetDevice.CPU_SPR:
msg = "target_device == CPU_SPR is not supported."
raise nncf.ValidationError(msg)
if mode is not None:
msg = f"mode={mode} is not supported"
raise ValueError(msg)
if model.opset_import[0].version < 10:

opset_version = model.opset_import[0].version
if opset_version < 21 and mode is not None:
msg = f"FP8 quantization requires opset >= 21, got {opset_version}"
if opset_version < 10:
msg = "ONNX models with opset version < 10 do not support quantization."
raise nncf.ValidationError(msg)
if model.opset_import[0].version < 13:
if opset_version < 13:
nncf_logger.warning(
"ONNX models with 10 < opset version < 13 do not support per-channel quantization."
" Per-tensor quantization will be applied."
Expand All @@ -163,6 +164,7 @@ def quantize_impl(
model = apply_preprocess_passes(model)

quantization_algorithm = PostTrainingQuantization(
mode=mode,
preset=preset,
target_device=target_device,
subset_size=subset_size,
Expand Down
39 changes: 38 additions & 1 deletion src/nncf/onnx/quantization/quantizer_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from dataclasses import dataclass

import numpy as np
import onnx

from nncf.quantization.advanced_parameters import FP8Type
from nncf.quantization.fake_quantize import FakeConvertParameters
from nncf.quantization.fake_quantize import FakeQuantizeParameters
from nncf.quantization.fake_quantize import calculate_scale_zero_point
from nncf.tensor import functions as fns
Expand All @@ -31,10 +34,44 @@ class ONNXQuantizerLayerParameters:

scale: np.ndarray
zero_point: np.ndarray
tensor_type: np.dtype
tensor_type: onnx.TensorProto.DataType | np.dtype
axis: int | None = None


def convert_fc_params_to_onnx_params(
    parameters: FakeConvertParameters, axis: int | None
) -> ONNXQuantizerLayerParameters:
    """
    Builds ONNXQuantizerLayerParameters out of common FakeConvertParameters.

    The FP8 destination type is mapped to the matching ONNX tensor type, the scale is
    inverted (with near-zero magnitudes replaced by machine epsilon first), and both the
    scale and the shift are squeezed before being stored.

    :param parameters: FakeConvertParameters representation.
    :param axis: Axis for per-channel quantization.
    :return: Quantizer layer attributes.
    :raises ValueError: If the destination type is neither FP8Type.E4M3 nor FP8Type.E5M2.
    """
    fp8_type = parameters.destination_type
    is_e4m3 = fp8_type == FP8Type.E4M3
    if not is_e4m3 and not (fp8_type == FP8Type.E5M2):
        msg = f"Unsupported FP8type: {parameters.destination_type}. Expected FP8Type.E4M3 or FP8Type.E5M2"
        raise ValueError(msg)
    tensor_type = onnx.TensorProto.FLOAT8E4M3FN if is_e4m3 else onnx.TensorProto.FLOAT8E5M2

    raw_scale = parameters.scale
    raw_shift = parameters.shift

    # TODO(andrey-churkin): Check that scale and zero_point are calculated correctly.

    # NOTE: magnitudes below machine epsilon are replaced by epsilon to avoid division by zero
    eps = fns.finfo(raw_scale).eps
    safe_scale = fns.where(fns.abs(raw_scale) < eps, eps, raw_scale)
    inverted_scale = 1.0 / safe_scale

    # ONNX demands parameters to be a scalar or 1-D Tensor.
    onnx_scale = fns.squeeze(inverted_scale)
    onnx_zero_point = fns.squeeze(raw_shift)

    return ONNXQuantizerLayerParameters(onnx_scale.data, onnx_zero_point.data, tensor_type, axis)


def convert_fq_params_to_onnx_params(
parameters: FakeQuantizeParameters, num_bits: int, tensor_type: np.dtype, axis: tuple[int]
) -> ONNXQuantizerLayerParameters:
Expand Down
6 changes: 4 additions & 2 deletions src/nncf/quantization/algorithms/min_max/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,9 @@ def filter_func(point: StatisticPoint) -> bool:
)
for quantization_target_point in unified_scale_group:
transformation_layout.register(
self._backend_entity.create_convert_insertion_command(quantization_target_point, parameters)
self._backend_entity.create_convert_insertion_command(
graph, quantization_target_point, qconfig, parameters
)
)
unified_ops_list.add(quantization_target_point)
continue
Expand Down Expand Up @@ -1069,7 +1071,7 @@ def filter_func(point: StatisticPoint) -> bool:
statistics, is_per_channel=qconfig.per_channel, destination_type=destination_type
)
command = self._backend_entity.create_convert_insertion_command(
quantization_target_point, parameters
graph, quantization_target_point, qconfig, parameters
)
else:
parameters = calculate_quantizer_parameters(statistics, qconfig, quant_group, half_range)
Expand Down
4 changes: 4 additions & 0 deletions src/nncf/quantization/algorithms/min_max/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,17 @@ def create_unified_scales_quantizers_insertion_commands(
@staticmethod
@abstractmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: TargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> Command:
"""
Returns backend-specific convert insertion command.

:param nncf_graph: NNCFGraph to get input/output shapes for the target point.
:param target_point: Target location for the convert operation insertion.
:param quantizer_config: QuantizerConfig instance for the current layer.
:param parameters: FakeConvertParameters (scale, shift and destination type) for the inserted convert operation.
:return: Backend-specific Command for the convert insertion operation.
"""
Expand Down
15 changes: 12 additions & 3 deletions src/nncf/quantization/algorithms/min_max/onnx_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import numpy as np

import nncf
from nncf.common.graph.graph import NNCFGraph
from nncf.common.graph.graph import NNCFNode
from nncf.common.graph.operator_metatypes import OperatorMetatype
Expand All @@ -33,6 +32,7 @@
from nncf.onnx.graph.transformations.commands import ONNXTargetPoint
from nncf.onnx.hardware.config import ONNXHWConfig
from nncf.onnx.quantization.default_quantization import DEFAULT_ONNX_QUANT_TRAIT_TO_OP_DICT
from nncf.onnx.quantization.quantizer_parameters import convert_fc_params_to_onnx_params
from nncf.onnx.quantization.quantizer_parameters import convert_fq_params_to_onnx_params
from nncf.parameters import ModelType
from nncf.parameters import TargetDevice
Expand Down Expand Up @@ -158,11 +158,20 @@ def create_unified_scales_quantizers_insertion_commands(

@staticmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: ONNXTargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> TransformationCommand:
msg = "FakeConvert insertion not implemented in ONNX backend!"
raise nncf.InternalError(msg)
axis = None
if quantizer_config.per_channel:
node = nncf_graph.get_node_by_name(target_point.target_node_name)
axis = (
get_weight_quantization_axis(node, target_point.port_id) if target_point.is_weight_target_point() else 1
)
onnx_parameters = convert_fc_params_to_onnx_params(parameters, axis)
nncf_input_node_next_nodes = ONNXMinMaxAlgoBackend._get_input_edges_mapping(nncf_graph)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential point for an optimization in the future. Maybe we could add a comment to highlight it.

return ONNXQuantizerInsertionCommand(target_point, nncf_input_node_next_nodes, onnx_parameters)

@staticmethod
def _get_input_edges_mapping(nncf_graph: NNCFGraph):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,9 @@ def create_unified_scales_quantizers_insertion_commands(

@staticmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: OVTargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> OVQuantizerInsertionCommand:
return OVConvertInsertionCommand(target_point, parameters)
Expand Down
2 changes: 2 additions & 0 deletions src/nncf/quantization/algorithms/min_max/torch_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ def target_point(target_type: TargetType, target_node_name: str, port_id: int) -

@staticmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: PTTargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> TransformationCommand:
msg = "FakeConvert insertion not implemented in PyTorch backend!"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ def target_point(target_type: TargetType, target_node_name: str, port_id: int) -

@staticmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: PTTargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> TransformationCommand:
msg = "FakeConvert insertion not implemented in PyTorch backend!"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class ONNXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
CompressWeightsMode.INT8_ASYM: onnx.TensorProto.UINT8,
CompressWeightsMode.INT4_SYM: onnx.TensorProto.INT4,
CompressWeightsMode.INT4_ASYM: onnx.TensorProto.UINT4,
CompressWeightsMode.FP8_E4M3: onnx.TensorProto.FLOAT8E4M3FN,
}

def __init__(self, model: onnx.ModelProto):
Expand Down Expand Up @@ -363,8 +364,14 @@ def _add_dequantize_linear_layer(
zero_point = pack_4_bits(zero_point)

# Create initializers for the quantized weights, scale, and zero point
if weight_dtype == onnx.TensorProto.FLOAT8E4M3FN:
np_dtype = helper.tensor_dtype_to_np_dtype(weight_dtype)
vals = onnx.numpy_helper.saturate_cast(np.asarray(quantized_weights), np_dtype).flatten()
else:
vals = quantized_weights
Comment on lines +367 to +371
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two similar code blocks, maybe worth a private method?


quantized_weights_initializer = onnx.helper.make_tensor(
quantized_weight_name, weight_dtype, orig_shape, quantized_weights.tobytes(), raw=True
quantized_weight_name, weight_dtype, orig_shape, vals.tobytes(), raw=True
)
scale_initializer = numpy_helper.from_array(
np.array(scale, dtype=helper.tensor_dtype_to_np_dtype(scale_dtype)), name=scale_name
Expand All @@ -374,8 +381,15 @@ def _add_dequantize_linear_layer(

if zero_point is not None:
deq_inputs.append(weight_name + "_zero_point")

if weight_dtype == onnx.TensorProto.FLOAT8E4M3FN:
np_dtype = helper.tensor_dtype_to_np_dtype(weight_dtype)
vals = onnx.numpy_helper.saturate_cast(np.asarray(zero_point), np_dtype).flatten()
else:
vals = zero_point

zero_point_initializer = onnx.helper.make_tensor(
weight_name + "_zero_point", weight_dtype, orig_zero_point_shape, zero_point.tobytes(), raw=True
weight_name + "_zero_point", weight_dtype, orig_zero_point_shape, vals.tobytes(), raw=True
)
new_initializers.append(zero_point_initializer)

Expand Down
1 change: 0 additions & 1 deletion src/nncf/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,6 @@ def compress_weights(
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
CompressWeightsMode.NVFP4,
CompressWeightsMode.CODEBOOK,
Expand Down
1 change: 0 additions & 1 deletion tests/onnx/quantization/test_weights_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
CompressWeightsMode.NVFP4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
)

Expand Down