From b4101e1ceeda619e2819f494917b07ebd53ab5db Mon Sep 17 00:00:00 2001
From: TFLM-bot
Date: Mon, 26 Jan 2026 14:06:53 +0000
Subject: [PATCH] Sync from upstream TF.

---
 .../internal/reference/concatenation.h        | 72 +++++++++++++++++++
 tensorflow/lite/kernels/internal/types.h      |  5 ++
 2 files changed, 77 insertions(+)

diff --git a/tensorflow/lite/kernels/internal/reference/concatenation.h b/tensorflow/lite/kernels/internal/reference/concatenation.h
index 9d2ecbecc70..4a82d7c502d 100644
--- a/tensorflow/lite/kernels/internal/reference/concatenation.h
+++ b/tensorflow/lite/kernels/internal/reference/concatenation.h
@@ -70,6 +70,78 @@ inline void Concatenation(const ConcatenationParams& params,
   }
 }
 
+// Specialization for Int4: elements are packed two-per-byte (low nibble
+// first), so concatenation must be done nibble-by-nibble rather than with
+// bulk byte copies.
+template <>
+inline void Concatenation(const ConcatenationParams& params,
+                          const RuntimeShape* const* input_shapes,
+                          const Int4* const* input_data,
+                          const RuntimeShape& output_shape,
+                          Int4* output_data) {
+  int axis = params.axis;
+  int inputs_count = params.inputs_count;
+  const int concat_dimensions = output_shape.DimensionsCount();
+  TFLITE_DCHECK_LT(axis, concat_dimensions);
+
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
+    for (int j = 0; j < concat_dimensions; j++) {
+      if (j != axis) {
+        MatchingDim(*input_shapes[i], j, output_shape, j);
+      }
+    }
+    concat_size += input_shapes[i]->Dims(axis);
+  }
+  TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
+  int64_t outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    outer_size *= output_shape.Dims(i);
+  }
+  // For all input arrays,
+  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+  int64_t base_inner_size = 1;
+  for (int i = axis + 1; i < concat_dimensions; ++i) {
+    base_inner_size *= output_shape.Dims(i);
+  }
+
+  uint8_t* output_ptr = reinterpret_cast<uint8_t*>(output_data);
+  // We can't guarantee that the output buffer is initialized to 0, so we have
+  // to clear it to ensure the high/low nibbles not currently being written are
+  // not garbage.
+  // Note: output_shape.FlatSize() gives number of elements (nibbles).
+  // Bytes needed: (elements + 1) / 2.
+  memset(output_ptr, 0, (output_shape.FlatSize() + 1) / 2);
+
+  int64_t output_offset = 0;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int64_t copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+      const uint8_t* input_ptr =
+          reinterpret_cast<const uint8_t*>(input_data[i]);
+      // The input_ptr points to the start of the tensor data.
+      // We need to calculate the offset for the current outer loop iteration
+      // 'k'.
+      // The tensor has total elements = outer_size * copy_size.
+      // So current offset in elements is k * copy_size.
+      int64_t input_offset = k * copy_size;
+
+      for (int j = 0; j < copy_size; ++j) {
+        int64_t in_idx = input_offset + j;
+        uint8_t val = input_ptr[in_idx / 2];
+        uint8_t nibble = (in_idx % 2 == 0) ? (val & 0x0F) : ((val >> 4) & 0x0F);
+
+        int64_t out_idx = output_offset + j;
+        uint8_t* out_byte = output_ptr + (out_idx / 2);
+        if (out_idx % 2 == 0) {
+          *out_byte = (*out_byte & 0xF0) | nibble;
+        } else {
+          *out_byte = (*out_byte & 0x0F) | (nibble << 4);
+        }
+      }
+      output_offset += copy_size;
+    }
+  }
+}
+
 // TODO(b/174275780): The quantized implementation of concatentation isn't fully
 // quantized as it takes scale as a floating point value. This should be fixed
 // when optimizng this routine further.
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
index 510ffa30498..1cfc43d1662 100644
--- a/tensorflow/lite/kernels/internal/types.h
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -34,6 +34,11 @@ enum class FusedActivationFunctionType : uint8_t {
 };
 enum class PaddingType : uint8_t { kNone, kSame, kValid };
 
+// Empty struct to allow using a distinct type for Int4.
+struct Int4 {
+  uint8_t val;
+};
+
 struct PaddingValues {
   int16_t width;
   int16_t height;