diff --git a/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h b/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
index 1327162f232..c580bf03cd3 100644
--- a/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
+++ b/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 /// WARNING: Users of TensorFlow Lite should not include this file directly,
-/// but should instead include
-/// "third_party/tensorflow/lite/c/builtin_op_data.h".
-/// Only the TensorFlow Lite implementation itself should include this
-/// file directly.
+/// as only the TensorFlow Lite implementation itself should.
+
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/builtin_op_data.h"
+
 #ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_
 #define TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_
 
diff --git a/tensorflow/lite/core/api/tensor_utils.cc b/tensorflow/lite/core/api/tensor_utils.cc
index 18a643c78dc..c5052c78f84 100644
--- a/tensorflow/lite/core/api/tensor_utils.cc
+++ b/tensorflow/lite/core/api/tensor_utils.cc
@@ -33,8 +33,8 @@ TfLiteStatus ResetVariableTensor(TfLiteTensor* tensor) {
   }
   // TODO(b/139446230): Provide a platform header to better handle these
  // specific scenarios.
-#if __ANDROID__ || defined(__x86_64__) || defined(__i386__) || \
-    defined(__i386) || defined(__x86__) || defined(__X86__) || \
+#if defined(__ANDROID__) || defined(__x86_64__) || defined(__i386__) || \
+    defined(__i386) || defined(__x86__) || defined(__X86__) ||          \
     defined(_X86_) || defined(_M_IX86) || defined(_M_X64)
   memset(tensor->data.raw, value, tensor->bytes);
 #else
diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc
index d458b1eb29b..baa6282fd5b 100644
--- a/tensorflow/lite/core/c/common.cc
+++ b/tensorflow/lite/core/c/common.cc
@@ -113,14 +113,25 @@ TfLiteQuantization TfLiteQuantizationClone(const TfLiteQuantization& src) {
     case kTfLiteAffineQuantization: {
       dst.params = calloc(1, sizeof(TfLiteAffineQuantization));
       const TfLiteAffineQuantization* const src_params =
-          (TfLiteAffineQuantization*)(src.params);
+          reinterpret_cast<const TfLiteAffineQuantization*>(src.params);
       TfLiteAffineQuantization* const dst_params =
-          (TfLiteAffineQuantization*)(dst.params);
+          reinterpret_cast<TfLiteAffineQuantization*>(dst.params);
       dst_params->quantized_dimension = src_params->quantized_dimension;
       dst_params->scale = TfLiteFloatArrayCopy(src_params->scale);
       dst_params->zero_point = TfLiteIntArrayCopy(src_params->zero_point);
       break;
     }
+    case kTfLiteBlockwiseQuantization: {
+      dst.params = calloc(1, sizeof(TfLiteBlockwiseQuantization));
+      const TfLiteBlockwiseQuantization* const src_params =
+          reinterpret_cast<const TfLiteBlockwiseQuantization*>(src.params);
+      TfLiteBlockwiseQuantization* const dst_params =
+          reinterpret_cast<TfLiteBlockwiseQuantization*>(dst.params);
+      dst_params->blocksize = src_params->blocksize;
+      dst_params->scale = src_params->scale;
+      dst_params->zero_point = src_params->zero_point;
+      break;
+    }
   }
   return dst;
 }
@@ -225,7 +236,7 @@ void TfLiteTensorDataFree(TfLiteTensor* t) {
 void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
   if (quantization->type == kTfLiteAffineQuantization) {
     TfLiteAffineQuantization* q_params =
-        (TfLiteAffineQuantization*)(quantization->params);
+        reinterpret_cast<TfLiteAffineQuantization*>(quantization->params);
     if (q_params->scale) {
       TfLiteFloatArrayFree(q_params->scale);
       q_params->scale = nullptr;
diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h
index 87a9b1a5075..57caa3b759a 100644
--- a/tensorflow/lite/core/c/common.h
+++ b/tensorflow/lite/core/c/common.h
@@ -328,6 +328,8 @@ typedef enum TfLiteQuantizationType : int {
   /// Affine quantization (with support for per-channel quantization).
   /// Corresponds to TfLiteAffineQuantization.
   kTfLiteAffineQuantization = 1,
+  /// Blockwise quantization. Corresponds to TfLiteBlockwiseQuantization.
+  kTfLiteBlockwiseQuantization = 2,
 } TfLiteQuantizationType;
 
 /// Structure specifying the quantization used by the tensor, if-any.
@@ -353,6 +355,20 @@ typedef struct TfLiteAffineQuantization {
   int32_t quantized_dimension;
 } TfLiteAffineQuantization;
 
+/// Parameters for blockwise quantization across the output channels dimension.
+/// For a particular value in quantized_dimension, quantized values can be
+/// converted back to float using:
+///     `real_value = scale * (quantized_value - zero_point)`
+typedef struct TfLiteBlockwiseQuantization {
+  /// Index of the tensor containing the scales.
+  int32_t scale;
+  /// Index of the tensor containing the zero points.
+  int32_t zero_point;
+  /// Quantization block size.
+  int32_t blocksize;
+  int32_t quantized_dimension;
+} TfLiteBlockwiseQuantization;
+
 /// A union of pointers that points to memory for a given tensor.
 ///
 /// Do not access these members directly, if possible, use
diff --git a/tensorflow/lite/kernels/internal/reference/fully_connected.h b/tensorflow/lite/kernels/internal/reference/fully_connected.h
index ba51cbcfe3e..bccc6220062 100644
--- a/tensorflow/lite/kernels/internal/reference/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/fully_connected.h
@@ -16,6 +16,8 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
 
 #include <algorithm>
+#include <cmath>
+#include <cstdint>
 
 #include "ruy/profiler/instrumentation.h"  // from @ruy
 #include "tensorflow/lite/kernels/internal/common.h"
@@ -62,6 +64,59 @@ inline void FullyConnected(
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, float filter_scale,
+    uint8_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(b/62193649): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(acc) * effective_output_scale));
+      acc_scaled += output_offset;
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc_scaled);
+    }
+  }
+}
+
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8_t* input_data, const RuntimeShape& filter_shape,
@@ -164,6 +219,60 @@ inline void FullyConnected(
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, float filter_scale,
+    int16_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);
+  // TODO(b/62193649): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias value.
+      int32_t accum = bias_data[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; ++d) {
+        int16_t input_val = input_data[b * accum_depth + d] + input_offset;
+        int16_t filter_val =
+            filter_data[out_c * accum_depth + d] + filter_offset;
+        accum += filter_val * input_val;
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(accum) * effective_output_scale));
+      // Saturate, cast to int16_t, and store to output array.
+      acc_scaled = std::max(acc_scaled, output_activation_min - output_offset);
+      acc_scaled = std::min(acc_scaled, output_activation_max - output_offset);
+      acc_scaled += output_offset;
+      output_data[out_c + output_depth * b] = acc_scaled;
+    }
+  }
+}
+
 inline void ShuffledFullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8_t* input_data, const RuntimeShape& weights_shape,
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
index c6d06077934..f249beef850 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -16,6 +16,8 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
 
 #include <algorithm>
+#include <cmath>
+#include <cstdint>
 
 #include "tensorflow/lite/kernels/internal/common.h"
@@ -74,6 +76,61 @@ void FullyConnectedPerChannel(
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+template <typename InputType, typename WeightType, typename BiasType,
+          typename OutputType>
+void FullyConnectedPerChannel(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const InputType* input_data, const RuntimeShape& filter_shape,
+    const WeightType* filter_data, const RuntimeShape& bias_shape,
+    const BiasType* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, const float* filter_scales,
+    OutputType* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = output_shape.Dims(output_dim_count - 1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      BiasType acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += filter_val * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+
+      const float scale = filter_scales[out_c];
+      const double filter_scale = static_cast<double>(scale);
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            filter_scale /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(acc) * effective_output_scale));
+
+      acc_scaled += output_offset;
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] =
+          static_cast<OutputType>(acc_scaled);
+    }
+  }
+}
+
 template <typename InputType, typename WeightType, typename BiasType,
           typename OutputType>
 void FullyConnected(const FullyConnectedParams& params,
@@ -122,6 +179,59 @@ void FullyConnected(const FullyConnectedParams& params,
   }
 }
 
+// This implementation receives the scales as floats and performs the requant
+// in floating point to avoid loss of precision.
+template <typename InputType, typename WeightType, typename BiasType,
+          typename OutputType>
+void FullyConnected(const FullyConnectedParams& params,
+                    const RuntimeShape& input_shape,
+                    const InputType* input_data,
+                    const RuntimeShape& filter_shape,
+                    const WeightType* filter_data,
+                    const RuntimeShape& bias_shape, const BiasType* bias_data,
+                    const RuntimeShape& output_shape, float input_scale,
+                    float output_scale, float filter_scale,
+                    OutputType* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = output_shape.Dims(output_dim_count - 1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      BiasType acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          round(static_cast<double>(acc) * effective_output_scale));
+      acc_scaled += output_offset;
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] =
+          static_cast<OutputType>(acc_scaled);
+    }
+  }
+}
+
 }  // namespace reference_integer_ops
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/flatbuffer_utils.py b/tensorflow/lite/tools/flatbuffer_utils.py
index 4a57dba24f6..71e1afed6b5 100644
--- a/tensorflow/lite/tools/flatbuffer_utils.py
+++ b/tensorflow/lite/tools/flatbuffer_utils.py
@@ -21,7 +21,6 @@
 """
 
 import copy
-import functools
 import random
 import re
 import struct
@@ -490,12 +489,7 @@ def get_options_as(
   ):
     raise err
 
-  @functools.singledispatch
-  def _get_opts(unused_op):
-    return None
-
-  @_get_opts.register
-  def _(op: schema_fb.Operator):
+  if isinstance(op, schema_fb.Operator):
     if not is_opt_1_type:
       enum_val = getattr(schema_fb.BuiltinOptions2, base_type_name)
       opts_creator = schema_fb.BuiltinOptions2Creator
@@ -510,8 +504,7 @@ def _(op: schema_fb.Operator):
       return None
     return opts_creator(enum_val, raw_ops)
 
-  @_get_opts.register
-  def _(op: schema_fb.OperatorT):
+  elif isinstance(op, schema_fb.OperatorT):
     if is_opt_1_type:
       raw_ops_t = op.builtinOptions
     else:
@@ -520,4 +513,5 @@ def _(op: schema_fb.OperatorT):
       return None
     return raw_ops_t
 
-  return _get_opts(op)
+  else:
+    return None
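
Note on the new reference kernels: unlike the existing overloads, which take a precomputed fixed-point multiplier and shift in FullyConnectedParams, the float-scale overloads take the raw input, filter, and output scales and derive the effective output scale in double precision at run time. The following is a minimal usage sketch, assuming a TFLite source checkout; the shapes, data, and scale values are illustrative only and are not part of this change.

// Hedged usage sketch for the new float-requant reference FullyConnected.
// All shapes and values below are made up for illustration.
#include <cstdint>
#include <vector>

#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "tensorflow/lite/kernels/internal/types.h"

int main() {
  // One batch of 4 inputs, fully connected to 2 outputs.
  const tflite::RuntimeShape input_shape({1, 4});
  const tflite::RuntimeShape filter_shape({2, 4});
  const tflite::RuntimeShape bias_shape({2});
  const tflite::RuntimeShape output_shape({1, 2});

  const std::vector<uint8_t> input = {1, 2, 3, 4};
  // Row-major [output_depth, accum_depth] filter.
  const std::vector<uint8_t> filter = {1, 0, 0, 1,   // output 0
                                       0, 1, 1, 0};  // output 1
  const std::vector<int32_t> bias = {0, 0};
  std::vector<uint8_t> output(2);

  tflite::FullyConnectedParams params = {};
  params.input_offset = 0;
  params.weights_offset = 0;
  params.output_offset = 0;
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;

  // The kernel computes input_scale * filter_scale / output_scale = 0.125
  // in double precision instead of using a fixed-point multiplier/shift.
  tflite::reference_ops::FullyConnected(
      params, input_shape, input.data(), filter_shape, filter.data(),
      bias_shape, bias.data(), output_shape,
      /*input_scale=*/0.5f, /*output_scale=*/1.0f, /*filter_scale=*/0.25f,
      output.data());
  // Both accumulators equal 5, so each output is round(5 * 0.125) == 1.
  return 0;
}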