diff --git a/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h b/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
index 1327162f232..c580bf03cd3 100644
--- a/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
+++ b/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 /// WARNING: Users of TensorFlow Lite should not include this file directly,
-/// but should instead include
-/// "third_party/tensorflow/lite/c/builtin_op_data.h".
-/// Only the TensorFlow Lite implementation itself should include this
-/// file directly.
+/// as only the TensorFlow Lite implementation itself should.
+
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/builtin_op_data.h"
+
 #ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_
 #define TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_
 
diff --git a/tensorflow/lite/core/api/tensor_utils.cc b/tensorflow/lite/core/api/tensor_utils.cc
index 18a643c78dc..c5052c78f84 100644
--- a/tensorflow/lite/core/api/tensor_utils.cc
+++ b/tensorflow/lite/core/api/tensor_utils.cc
@@ -33,8 +33,8 @@ TfLiteStatus ResetVariableTensor(TfLiteTensor* tensor) {
   }
   // TODO(b/139446230): Provide a platform header to better handle these
  // specific scenarios.
-#if __ANDROID__ || defined(__x86_64__) || defined(__i386__) || \
-    defined(__i386) || defined(__x86__) || defined(__X86__) || \
+#if defined(__ANDROID__) || defined(__x86_64__) || defined(__i386__) || \
+    defined(__i386) || defined(__x86__) || defined(__X86__) ||          \
     defined(_X86_) || defined(_M_IX86) || defined(_M_X64)
   memset(tensor->data.raw, value, tensor->bytes);
 #else
diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc
index d458b1eb29b..baa6282fd5b 100644
--- a/tensorflow/lite/core/c/common.cc
+++ b/tensorflow/lite/core/c/common.cc
@@ -113,14 +113,25 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) {
     case kTfLiteAffineQuantization: {
       dst.params = calloc(1, sizeof(TfLiteAffineQuantization));
       const TfLiteAffineQuantization* const src_params =
-          (TfLiteAffineQuantization*)(src.params);
+          reinterpret_cast<const TfLiteAffineQuantization*>(src.params);
       TfLiteAffineQuantization* const dst_params =
-          (TfLiteAffineQuantization*)(dst.params);
+          reinterpret_cast<TfLiteAffineQuantization*>(dst.params);
       dst_params->quantized_dimension = src_params->quantized_dimension;
       dst_params->scale = TfLiteFloatArrayCopy(src_params->scale);
       dst_params->zero_point = TfLiteIntArrayCopy(src_params->zero_point);
       break;
     }
+    case kTfLiteBlockwiseQuantization: {
+      dst.params = calloc(1, sizeof(TfLiteBlockwiseQuantization));
+      const TfLiteBlockwiseQuantization* const src_params =
+          reinterpret_cast<const TfLiteBlockwiseQuantization*>(src.params);
+      TfLiteBlockwiseQuantization* const dst_params =
+          reinterpret_cast<TfLiteBlockwiseQuantization*>(dst.params);
+      dst_params->blocksize = src_params->blocksize;
+      dst_params->scale = src_params->scale;
+      dst_params->zero_point = src_params->zero_point;
+      break;
+    }
   }
   return dst;
 }
@@ -225,7 +236,7 @@ void TfLiteTensorDataFree(TfLiteTensor* t) {
 void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
   if (quantization->type == kTfLiteAffineQuantization) {
     TfLiteAffineQuantization* q_params =
-        (TfLiteAffineQuantization*)(quantization->params);
+        reinterpret_cast<TfLiteAffineQuantization*>(quantization->params);
     if (q_params->scale) {
       TfLiteFloatArrayFree(q_params->scale);
       q_params->scale = nullptr;
diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h
index 87a9b1a5075..57caa3b759a 100644
--- a/tensorflow/lite/core/c/common.h
+++ b/tensorflow/lite/core/c/common.h
@@ -328,6 +328,8 @@ typedef enum TfLiteQuantizationType : int {
   /// Affine quantization (with support for per-channel quantization).
   /// Corresponds to TfLiteAffineQuantization.
   kTfLiteAffineQuantization = 1,
+  /// Blockwise quantization. Corresponds to TfLiteBlockwiseQuantization.
+  kTfLiteBlockwiseQuantization = 2,
 } TfLiteQuantizationType;
 
 /// Structure specifying the quantization used by the tensor, if-any.
@@ -353,6 +355,20 @@ typedef struct TfLiteAffineQuantization {
   int32_t quantized_dimension;
 } TfLiteAffineQuantization;
 
+/// Parameters for blockwise quantization across the output channels dimension.
+/// For a particular value in quantized_dimension, quantized values can be
+/// converted back to float using:
+///     `real_value = scale * (quantized_value - zero_point)`
+typedef struct TfLiteBlockwiseQuantization {
+  /// Index of the tensor containing the scales.
+  int32_t scale;
+  /// Index of the tensor containing the zero points.
+  int32_t zero_point;
+  /// Quantization block size.
+  int32_t blocksize;
+  int32_t quantized_dimension;
+} TfLiteBlockwiseQuantization;
+
 /// A union of pointers that points to memory for a given tensor.
 ///
 /// Do not access these members directly, if possible, use
diff --git a/tensorflow/lite/kernels/internal/reference/fully_connected.h b/tensorflow/lite/kernels/internal/reference/fully_connected.h
index ba51cbcfe3e..bccc6220062 100644
--- a/tensorflow/lite/kernels/internal/reference/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/fully_connected.h
@@ -16,6 +16,8 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
 
 #include <algorithm>
+#include <cmath>
+#include <cstdint>
 
 #include "ruy/profiler/instrumentation.h"  // from @ruy
 #include "tensorflow/lite/kernels/internal/common.h"
@@ -62,6 +64,59 @@ inline void FullyConnected(
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, float filter_scale,
+    uint8_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(b/62193649): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(acc) * effective_output_scale));
+      acc_scaled += output_offset;
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc_scaled);
+    }
+  }
+}
+
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8_t* input_data, const RuntimeShape& filter_shape,
@@ -164,6 +219,60 @@ inline void FullyConnected(
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, float filter_scale,
+    int16_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);
+  // TODO(b/62193649): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias value.
+      int32_t accum = bias_data[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; ++d) {
+        int16_t input_val = input_data[b * accum_depth + d] + input_offset;
+        int16_t filter_val =
+            filter_data[out_c * accum_depth + d] + filter_offset;
+        accum += filter_val * input_val;
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(accum) * effective_output_scale));
+      // Saturate, cast to int16_t, and store to output array.
+      acc_scaled = std::max(acc_scaled, output_activation_min - output_offset);
+      acc_scaled = std::min(acc_scaled, output_activation_max - output_offset);
+      acc_scaled += output_offset;
+      output_data[out_c + output_depth * b] = acc_scaled;
+    }
+  }
+}
+
 inline void ShuffledFullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8_t* input_data, const RuntimeShape& weights_shape,
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
index c6d06077934..f249beef850 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -16,6 +16,8 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
 
 #include <algorithm>
+#include <cmath>
+#include <cstdint>
 
 #include "tensorflow/lite/kernels/internal/common.h"
@@ -74,6 +76,61 @@ void FullyConnectedPerChannel(
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+template <typename InputType, typename WeightType, typename BiasType,
+          typename OutputType>
+void FullyConnectedPerChannel(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const InputType* input_data, const RuntimeShape& filter_shape,
+    const WeightType* filter_data, const RuntimeShape& bias_shape,
+    const BiasType* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, const float* filter_scales,
+    OutputType* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = output_shape.Dims(output_dim_count - 1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      BiasType acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += filter_val * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+
+      const float scale = filter_scales[out_c];
+      const double filter_scale = static_cast<double>(scale);
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            filter_scale /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(acc) * effective_output_scale));
+
+      acc_scaled += output_offset;
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] =
+          static_cast<OutputType>(acc_scaled);
+    }
+  }
+}
+
 template <typename InputType, typename WeightType, typename BiasType,
           typename OutputType>
 void FullyConnected(const FullyConnectedParams& params,
@@ -122,6 +179,59 @@ void FullyConnected(const FullyConnectedParams& params,
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+template <typename InputType, typename WeightType, typename BiasType,
+          typename OutputType>
+void FullyConnected(const FullyConnectedParams& params,
+                    const RuntimeShape& input_shape,
+                    const InputType* input_data,
+                    const RuntimeShape& filter_shape,
+                    const WeightType* filter_data,
+                    const RuntimeShape& bias_shape, const BiasType* bias_data,
+                    const RuntimeShape& output_shape, float input_scale,
+                    float output_scale, float filter_scale,
+                    OutputType* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = output_shape.Dims(output_dim_count - 1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      BiasType acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(acc) * effective_output_scale));
+      acc_scaled += output_offset;
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] =
+          static_cast<OutputType>(acc_scaled);
+    }
+  }
+}
+
 }  // namespace reference_integer_ops
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/flatbuffer_utils.py b/tensorflow/lite/tools/flatbuffer_utils.py
index 4a57dba24f6..71e1afed6b5 100644
--- a/tensorflow/lite/tools/flatbuffer_utils.py
+++ b/tensorflow/lite/tools/flatbuffer_utils.py
@@ -21,7 +21,6 @@
 """
 
 import copy
-import functools
 import random
 import re
 import struct
@@ -490,12 +489,7 @@ def get_options_as(
   ):
     raise err
 
-  @functools.singledispatch
-  def _get_opts(unused_op):
-    return None
-
-  @_get_opts.register
-  def _(op: schema_fb.Operator):
+  if isinstance(op, schema_fb.Operator):
     if not is_opt_1_type:
       enum_val = getattr(schema_fb.BuiltinOptions2, base_type_name)
       opts_creator = schema_fb.BuiltinOptions2Creator
@@ -510,8 +504,7 @@ def _(op: schema_fb.Operator):
       return None
     return opts_creator(enum_val, raw_ops)
 
-  @_get_opts.register
-  def _(op: schema_fb.OperatorT):
+  elif isinstance(op, schema_fb.OperatorT):
     if is_opt_1_type:
       raw_ops_t = op.builtinOptions
     else:
@@ -520,4 +513,5 @@ def _(op: schema_fb.OperatorT):
       return None
     return raw_ops_t
 
-  return _get_opts(op)
+  else:
+    return None
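
Note on the new reference kernels: unlike the existing overloads, which take a precomputed fixed-point multiplier and shift in FullyConnectedParams, the float-scale overloads take the raw input, filter, and output scales and derive the effective output scale in double precision at run time. The following is a minimal usage sketch, assuming a TFLite source checkout; the shapes, data, and scale values are illustrative only and are not part of this change.

// Hedged usage sketch for the new float-requant reference FullyConnected.
// All shapes and values below are made up for illustration.
#include <cstdint>
#include <vector>

#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "tensorflow/lite/kernels/internal/types.h"

int main() {
  // One batch of 4 inputs, fully connected to 2 outputs.
  const tflite::RuntimeShape input_shape({1, 4});
  const tflite::RuntimeShape filter_shape({2, 4});
  const tflite::RuntimeShape bias_shape({2});
  const tflite::RuntimeShape output_shape({1, 2});

  const std::vector<uint8_t> input = {1, 2, 3, 4};
  // Row-major [output_depth, accum_depth] filter.
  const std::vector<uint8_t> filter = {1, 0, 0, 1,   // output 0
                                       0, 1, 1, 0};  // output 1
  const std::vector<int32_t> bias = {0, 0};
  std::vector<uint8_t> output(2);

  tflite::FullyConnectedParams params = {};
  params.input_offset = 0;
  params.weights_offset = 0;
  params.output_offset = 0;
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;

  // The kernel computes input_scale * filter_scale / output_scale = 0.125
  // in double precision instead of using a fixed-point multiplier/shift.
  tflite::reference_ops::FullyConnected(
      params, input_shape, input.data(), filter_shape, filter.data(),
      bias_shape, bias.data(), output_shape,
      /*input_scale=*/0.5f, /*output_scale=*/1.0f, /*filter_scale=*/0.25f,
      output.data());
  // Both accumulators equal 5, so each output is round(5 * 0.125) == 1.
  return 0;
}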