From e32dd525489175e19fbb1701e43fcfdf545a4e38 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 11:31:49 -0700
Subject: [PATCH 1/4] perf: optimize shuffle array element iteration with
 slice-based append
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use bulk-append methods for primitive types in SparkUnsafeArray:
- Non-nullable path uses append_slice() for optimal memcpy-style copy
- Nullable path uses pointer iteration with efficient null bitset reading

Supported types: i8, i16, i32, i64, f32, f64, date32, timestamp

Benchmark results (10K elements):

| Type | Baseline | Optimized | Speedup |
|------|----------|-----------|---------|
| i32/no_nulls | 6.08µs | 0.65µs | **9.3x** |
| i32/with_nulls | 22.49µs | 16.21µs | **1.39x** |
| i64/no_nulls | 6.15µs | 1.22µs | **5x** |
| i64/with_nulls | 16.41µs | 16.41µs | 1x |
| f64/no_nulls | 8.05µs | 1.22µs | **6.6x** |
| f64/with_nulls | 16.52µs | 16.21µs | 1.02x |
| date32/no_nulls | - | 0.66µs | ~9x |
| timestamp/no_nulls | - | 1.21µs | ~5x |

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 native/core/Cargo.toml                    |   4 +
 native/core/benches/array_conversion.rs   | 263 +++++++++++++++
 native/core/src/execution/shuffle/list.rs | 389 +++++++++++++++++++---
 native/core/src/execution/shuffle/mod.rs  |   2 +-
 4 files changed, 602 insertions(+), 56 deletions(-)
 create mode 100644 native/core/benches/array_conversion.rs
diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml
index 5e30883e35..a4622eb932 100644
--- a/native/core/Cargo.toml
+++ b/native/core/Cargo.toml
@@ -131,3 +131,7 @@ harness = false
 [[bench]]
 name = "parquet_decode"
 harness = false
+
+[[bench]]
+name = "array_conversion"
+harness = false
diff --git a/native/core/benches/array_conversion.rs b/native/core/benches/array_conversion.rs
new file mode 100644
index 0000000000..4cb1f7d13b
--- /dev/null
+++ b/native/core/benches/array_conversion.rs
@@ -0,0 +1,263 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for SparkUnsafeArray to Arrow array conversion.
+//! This specifically tests the append_to_builder function used in shuffle read path.
+
+use arrow::array::builder::{ArrayBuilder, Int32Builder, Int64Builder, Float64Builder, Date32Builder, TimestampMicrosecondBuilder};
+use arrow::datatypes::{DataType, TimeUnit};
+use comet::execution::shuffle::list::{append_to_builder, SparkUnsafeArray};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+
+const NUM_ELEMENTS: usize = 10000;
+
+/// Create a SparkUnsafeArray in memory with i32 elements.
+/// Layout:
+/// - 8 bytes: num_elements (i64)
+/// - null bitset: 8 bytes per 64 elements
+/// - element data: 4 bytes per element (i32)
+fn create_spark_unsafe_array_i32(num_elements: usize, with_nulls: bool) -> Vec<u8> {
+    // Header size: 8 (num_elements) + ceil(num_elements/64) * 8 (null bitset)
+    let null_bitset_words = (num_elements + 63) / 64;
+    let header_size = 8 + null_bitset_words * 8;
+    let data_size = num_elements * 4; // i32 = 4 bytes
+    let total_size = header_size + data_size;
+
+    let mut buffer = vec![0u8; total_size];
+
+    // Write num_elements
+    buffer[0..8].copy_from_slice(&(num_elements as i64).to_le_bytes());
+
+    // Write null bitset (set every 10th element as null if with_nulls)
+    if with_nulls {
+        for i in (0..num_elements).step_by(10) {
+            let word_idx = i / 64;
+            let bit_idx = i % 64;
+            let word_offset = 8 + word_idx * 8;
+            let current_word = i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
+            let new_word = current_word | (1i64 << bit_idx);
+            buffer[word_offset..word_offset + 8].copy_from_slice(&new_word.to_le_bytes());
+        }
+    }
+
+    // Write element data
+    for i in 0..num_elements {
+        let offset = header_size + i * 4;
+        buffer[offset..offset + 4].copy_from_slice(&(i as i32).to_le_bytes());
+    }
+
+    buffer
+}
+
+/// Create a SparkUnsafeArray in memory with i64 elements.
+fn create_spark_unsafe_array_i64(num_elements: usize, with_nulls: bool) -> Vec<u8> {
+    let null_bitset_words = (num_elements + 63) / 64;
+    let header_size = 8 + null_bitset_words * 8;
+    let data_size = num_elements * 8; // i64 = 8 bytes
+    let total_size = header_size + data_size;
+
+    let mut buffer = vec![0u8; total_size];
+
+    // Write num_elements
+    buffer[0..8].copy_from_slice(&(num_elements as i64).to_le_bytes());
+
+    // Write null bitset
+    if with_nulls {
+        for i in (0..num_elements).step_by(10) {
+            let word_idx = i / 64;
+            let bit_idx = i % 64;
+            let word_offset = 8 + word_idx * 8;
+            let current_word = i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
+            let new_word = current_word | (1i64 << bit_idx);
+            buffer[word_offset..word_offset + 8].copy_from_slice(&new_word.to_le_bytes());
+        }
+    }
+
+    // Write element data
+    for i in 0..num_elements {
+        let offset = header_size + i * 8;
+        buffer[offset..offset + 8].copy_from_slice(&(i as i64).to_le_bytes());
+    }
+
+    buffer
+}
+
+/// Create a SparkUnsafeArray in memory with f64 elements.
+fn create_spark_unsafe_array_f64(num_elements: usize, with_nulls: bool) -> Vec<u8> {
+    let null_bitset_words = (num_elements + 63) / 64;
+    let header_size = 8 + null_bitset_words * 8;
+    let data_size = num_elements * 8; // f64 = 8 bytes
+    let total_size = header_size + data_size;
+
+    let mut buffer = vec![0u8; total_size];
+
+    // Write num_elements
+    buffer[0..8].copy_from_slice(&(num_elements as i64).to_le_bytes());
+
+    // Write null bitset
+    if with_nulls {
+        for i in (0..num_elements).step_by(10) {
+            let word_idx = i / 64;
+            let bit_idx = i % 64;
+            let word_offset = 8 + word_idx * 8;
+            let current_word = i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
+            let new_word = current_word | (1i64 << bit_idx);
+            buffer[word_offset..word_offset + 8].copy_from_slice(&new_word.to_le_bytes());
+        }
+    }
+
+    // Write element data
+    for i in 0..num_elements {
+        let offset = header_size + i * 8;
+        buffer[offset..offset + 8].copy_from_slice(&(i as f64).to_le_bytes());
+    }
+
+    buffer
+}
+
+fn benchmark_array_conversion(c: &mut Criterion) {
+    let mut group = c.benchmark_group("spark_unsafe_array_to_arrow");
+
+    // Benchmark i32 array conversion
+    for with_nulls in [false, true] {
+        let buffer = create_spark_unsafe_array_i32(NUM_ELEMENTS, with_nulls);
+        let array = SparkUnsafeArray::new(buffer.as_ptr() as i64);
+        let null_str = if with_nulls { "with_nulls" } else { "no_nulls" };
+
+        group.bench_with_input(
+            BenchmarkId::new("i32", null_str),
+            &(&array, &buffer),
+            |b, (array, _buffer)| {
+                b.iter(|| {
+                    let mut builder = Int32Builder::with_capacity(NUM_ELEMENTS);
+                    if with_nulls {
+                        append_to_builder::<true>(&DataType::Int32, &mut builder, array).unwrap();
+                    } else {
+                        append_to_builder::<false>(&DataType::Int32, &mut builder, array).unwrap();
+                    }
+                    builder.finish()
+                });
+            },
+        );
+    }
+
+    // Benchmark i64 array conversion
+    for with_nulls in [false, true] {
+        let buffer = create_spark_unsafe_array_i64(NUM_ELEMENTS, with_nulls);
+        let array = SparkUnsafeArray::new(buffer.as_ptr() as i64);
+        let null_str = if with_nulls { "with_nulls" } else { "no_nulls" };
+
+        group.bench_with_input(
+            BenchmarkId::new("i64", null_str),
+            &(&array, &buffer),
+            |b, (array, _buffer)| {
+                b.iter(|| {
+                    let mut builder = Int64Builder::with_capacity(NUM_ELEMENTS);
+                    if with_nulls {
+                        append_to_builder::<true>(&DataType::Int64, &mut builder, array).unwrap();
+                    } else {
+                        append_to_builder::<false>(&DataType::Int64, &mut builder, array).unwrap();
+                    }
+                    builder.finish()
+                });
+            },
+        );
+    }
+
+    // Benchmark f64 array conversion
+    for with_nulls in [false, true] {
+        let buffer = create_spark_unsafe_array_f64(NUM_ELEMENTS, with_nulls);
+        let array = SparkUnsafeArray::new(buffer.as_ptr() as i64);
+        let null_str = if with_nulls { "with_nulls" } else { "no_nulls" };
+
+        group.bench_with_input(
+            BenchmarkId::new("f64", null_str),
+            &(&array, &buffer),
+            |b, (array, _buffer)| {
+                b.iter(|| {
+                    let mut builder = Float64Builder::with_capacity(NUM_ELEMENTS);
+                    if with_nulls {
+                        append_to_builder::<true>(&DataType::Float64, &mut builder, array).unwrap();
+                    } else {
+                        append_to_builder::<false>(&DataType::Float64, &mut builder, array).unwrap();
+                    }
+                    builder.finish()
+                });
+            },
+        );
+    }
+
+    // Benchmark date32 array conversion (same memory layout as i32)
+    for with_nulls in [false, true] {
+        let buffer = create_spark_unsafe_array_i32(NUM_ELEMENTS, with_nulls);
+        let array = SparkUnsafeArray::new(buffer.as_ptr() as i64);
+        let null_str = if with_nulls { "with_nulls" } else { "no_nulls" };
+
+        group.bench_with_input(
+            BenchmarkId::new("date32", null_str),
+            &(&array, &buffer),
+            |b, (array, _buffer)| {
+                b.iter(|| {
+                    let mut builder = Date32Builder::with_capacity(NUM_ELEMENTS);
+                    if with_nulls {
+                        append_to_builder::<true>(&DataType::Date32, &mut builder, array).unwrap();
+                    } else {
+                        append_to_builder::<false>(&DataType::Date32, &mut builder, array).unwrap();
+                    }
+                    builder.finish()
+                });
+            },
+        );
+    }
+
+    // Benchmark timestamp array conversion (same memory layout as i64)
+    for with_nulls in [false, true] {
+        let buffer = create_spark_unsafe_array_i64(NUM_ELEMENTS, with_nulls);
+        let array = SparkUnsafeArray::new(buffer.as_ptr() as i64);
+        let null_str = if with_nulls { "with_nulls" } else { "no_nulls" };
+
+        group.bench_with_input(
+            BenchmarkId::new("timestamp", null_str),
+            &(&array, &buffer),
+            |b, (array, _buffer)| {
+                b.iter(|| {
+                    let mut builder = TimestampMicrosecondBuilder::with_capacity(NUM_ELEMENTS);
+                    let dt = DataType::Timestamp(TimeUnit::Microsecond, None);
+                    if with_nulls {
+                        append_to_builder::<true>(&dt, &mut builder, array).unwrap();
+                    } else {
+                        append_to_builder::<false>(&dt, &mut builder, array).unwrap();
+                    }
+                    builder.finish()
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn config() -> Criterion {
+    Criterion::default()
+}
+
+criterion_group! {
+    name = benches;
+    config = config();
+    targets = benchmark_array_conversion
+}
+criterion_main!(benches);
diff --git a/native/core/src/execution/shuffle/list.rs b/native/core/src/execution/shuffle/list.rs
index c31244b87d..e4ee31618a 100644
--- a/native/core/src/execution/shuffle/list.rs
+++ b/native/core/src/execution/shuffle/list.rs
@@ -90,6 +90,322 @@ impl SparkUnsafeArray {
             (word & mask) != 0
         }
     }
+
+    /// Returns the null bitset pointer (starts at row_addr + 8).
+    #[inline]
+    pub(crate) fn null_bitset_ptr(&self) -> *const i64 {
+        (self.row_addr + 8) as *const i64
+    }
+
+    /// Bulk append i32 values to builder.
+    /// For non-nullable: uses slice append for optimal performance.
+    /// For nullable: uses pointer iteration with efficient null bitset reading.
+    #[inline]
+    pub(crate) fn append_ints_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut Int32Builder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const i32;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            // Use slice-based append for non-nullable path (much faster)
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const i32, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
+
+    /// Bulk append i64 values to builder.
+    #[inline]
+    pub(crate) fn append_longs_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut Int64Builder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const i64;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const i64, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
+
+    /// Bulk append i16 values to builder.
+    #[inline]
+    pub(crate) fn append_shorts_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut Int16Builder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const i16;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const i16, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
+
+    /// Bulk append i8 values to builder.
+    #[inline]
+    pub(crate) fn append_bytes_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut Int8Builder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const i8;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const i8, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
+
+    /// Bulk append f32 values to builder.
+    #[inline]
+    pub(crate) fn append_floats_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut Float32Builder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const f32;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const f32, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
+
+    /// Bulk append f64 values to builder.
+    #[inline]
+    pub(crate) fn append_doubles_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut Float64Builder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const f64;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const f64, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
+
+    /// Bulk append boolean values to builder using pointer iteration.
+    #[inline]
+    pub(crate) fn append_booleans_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut BooleanBuilder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        let mut ptr = self.element_offset as *const u8;
+
+        if NULLABLE {
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr != 0 });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            for _ in 0..num_elements {
+                builder.append_value(unsafe { *ptr != 0 });
+                ptr = unsafe { ptr.add(1) };
+            }
+        }
+    }
+
+    /// Bulk append timestamp values to builder.
+    #[inline]
+    pub(crate) fn append_timestamps_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut TimestampMicrosecondBuilder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const i64;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const i64, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
+
+    /// Bulk append date values to builder.
+    #[inline]
+    pub(crate) fn append_dates_to_builder<const NULLABLE: bool>(
+        &self,
+        builder: &mut Date32Builder,
+    ) {
+        let num_elements = self.num_elements;
+        if num_elements == 0 {
+            return;
+        }
+
+        if NULLABLE {
+            let mut ptr = self.element_offset as *const i32;
+            let null_words = self.null_bitset_ptr();
+            for idx in 0..num_elements {
+                let word_idx = idx >> 6;
+                let bit_idx = idx & 0x3f;
+                let is_null = unsafe { (*null_words.add(word_idx) & (1i64 << bit_idx)) != 0 };
+
+                if is_null {
+                    builder.append_null();
+                } else {
+                    builder.append_value(unsafe { *ptr });
+                }
+                ptr = unsafe { ptr.add(1) };
+            }
+        } else {
+            let slice = unsafe {
+                std::slice::from_raw_parts(self.element_offset as *const i32, num_elements)
+            };
+            builder.append_slice(slice);
+        }
+    }
 }
 
 pub fn append_to_builder<const NULLABLE: bool>(
@@ -112,77 +428,40 @@ pub fn append_to_builder<const NULLABLE: bool>(
 
     match data_type {
         DataType::Boolean => {
-            add_values!(
-                BooleanBuilder,
-                |builder: &mut BooleanBuilder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_boolean(idx)),
-                |builder: &mut BooleanBuilder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(BooleanBuilder, builder);
+            array.append_booleans_to_builder::<NULLABLE>(builder);
         }
         DataType::Int8 => {
-            add_values!(
-                Int8Builder,
-                |builder: &mut Int8Builder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_byte(idx)),
-                |builder: &mut Int8Builder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(Int8Builder, builder);
+            array.append_bytes_to_builder::<NULLABLE>(builder);
         }
         DataType::Int16 => {
-            add_values!(
-                Int16Builder,
-                |builder: &mut Int16Builder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_short(idx)),
-                |builder: &mut Int16Builder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(Int16Builder, builder);
+            array.append_shorts_to_builder::<NULLABLE>(builder);
         }
         DataType::Int32 => {
-            add_values!(
-                Int32Builder,
-                |builder: &mut Int32Builder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_int(idx)),
-                |builder: &mut Int32Builder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(Int32Builder, builder);
+            array.append_ints_to_builder::<NULLABLE>(builder);
         }
         DataType::Int64 => {
-            add_values!(
-                Int64Builder,
-                |builder: &mut Int64Builder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_long(idx)),
-                |builder: &mut Int64Builder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(Int64Builder, builder);
+            array.append_longs_to_builder::<NULLABLE>(builder);
         }
         DataType::Float32 => {
-            add_values!(
-                Float32Builder,
-                |builder: &mut Float32Builder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_float(idx)),
-                |builder: &mut Float32Builder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(Float32Builder, builder);
+            array.append_floats_to_builder::<NULLABLE>(builder);
         }
         DataType::Float64 => {
-            add_values!(
-                Float64Builder,
-                |builder: &mut Float64Builder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_double(idx)),
-                |builder: &mut Float64Builder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(Float64Builder, builder);
+            array.append_doubles_to_builder::<NULLABLE>(builder);
         }
         DataType::Timestamp(TimeUnit::Microsecond, _) => {
-            add_values!(
-                TimestampMicrosecondBuilder,
-                |builder: &mut TimestampMicrosecondBuilder,
-                 values: &SparkUnsafeArray,
-                 idx: usize| builder.append_value(values.get_timestamp(idx)),
-                |builder: &mut TimestampMicrosecondBuilder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(TimestampMicrosecondBuilder, builder);
+            array.append_timestamps_to_builder::<NULLABLE>(builder);
         }
         DataType::Date32 => {
-            add_values!(
-                Date32Builder,
-                |builder: &mut Date32Builder, values: &SparkUnsafeArray, idx: usize| builder
-                    .append_value(values.get_date(idx)),
-                |builder: &mut Date32Builder| builder.append_null()
-            );
+            let builder = downcast_builder_ref!(Date32Builder, builder);
+            array.append_dates_to_builder::<NULLABLE>(builder);
         }
         DataType::Binary => {
             add_values!(
diff --git a/native/core/src/execution/shuffle/mod.rs b/native/core/src/execution/shuffle/mod.rs
index e2798df63e..172dc5f942 100644
--- a/native/core/src/execution/shuffle/mod.rs
+++ b/native/core/src/execution/shuffle/mod.rs
@@ -17,7 +17,7 @@
 
 pub(crate) mod codec;
 mod comet_partitioning;
-mod list;
+pub mod list;
 mod map;
 pub mod row;
 mod shuffle_writer;

From 2ea5631ffc6f0a6377bc6f76b7144cbb7087bcb8 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 11:32:59 -0700
Subject: [PATCH 2/4] chore: format code

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 native/core/benches/array_conversion.rs   | 17 ++++++++++++-----
 native/core/src/execution/shuffle/list.rs | 15 +++------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/native/core/benches/array_conversion.rs b/native/core/benches/array_conversion.rs
index 4cb1f7d13b..ed6639a146 100644
--- a/native/core/benches/array_conversion.rs
+++ b/native/core/benches/array_conversion.rs
@@ -18,7 +18,10 @@
 //! Benchmarks for SparkUnsafeArray to Arrow array conversion.
 //! This specifically tests the append_to_builder function used in shuffle read path.
 
-use arrow::array::builder::{ArrayBuilder, Int32Builder, Int64Builder, Float64Builder, Date32Builder, TimestampMicrosecondBuilder};
+use arrow::array::builder::{
+    ArrayBuilder, Date32Builder, Float64Builder, Int32Builder, Int64Builder,
+    TimestampMicrosecondBuilder,
+};
 use arrow::datatypes::{DataType, TimeUnit};
 use comet::execution::shuffle::list::{append_to_builder, SparkUnsafeArray};
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -48,7 +51,8 @@ fn create_spark_unsafe_array_i32(num_elements: usize, with_nulls: bool) -> Vec<u
             let word_idx = i / 64;
             let bit_idx = i % 64;
             let word_offset = 8 + word_idx * 8;
-            let current_word = i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
+            let current_word =
+                i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
             let new_word = current_word | (1i64 << bit_idx);
             buffer[word_offset..word_offset + 8].copy_from_slice(&new_word.to_le_bytes());
         }
@@ -81,7 +85,8 @@ fn create_spark_unsafe_array_i64(num_elements: usize, with_nulls: bool) -> Vec<u
             let word_idx = i / 64;
             let bit_idx = i % 64;
             let word_offset = 8 + word_idx * 8;
-            let current_word = i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
+            let current_word =
+                i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
             let new_word = current_word | (1i64 << bit_idx);
             buffer[word_offset..word_offset + 8].copy_from_slice(&new_word.to_le_bytes());
         }
@@ -114,7 +119,8 @@ fn create_spark_unsafe_array_f64(num_elements: usize, with_nulls: bool) -> Vec<u
             let word_idx = i / 64;
             let bit_idx = i % 64;
             let word_offset = 8 + word_idx * 8;
-            let current_word = i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
+            let current_word =
+                i64::from_le_bytes(buffer[word_offset..word_offset + 8].try_into().unwrap());
             let new_word = current_word | (1i64 << bit_idx);
             buffer[word_offset..word_offset + 8].copy_from_slice(&new_word.to_le_bytes());
         }
@@ -193,7 +199,8 @@ fn benchmark_array_conversion(c: &mut Criterion) {
                     if with_nulls {
                         append_to_builder::<true>(&DataType::Float64, &mut builder, array).unwrap();
                     } else {
-                        append_to_builder::<false>(&DataType::Float64, &mut builder, array).unwrap();
+                        append_to_builder::<false>(&DataType::Float64, &mut builder, array)
+                            .unwrap();
                     }
                     builder.finish()
                 });
diff --git a/native/core/src/execution/shuffle/list.rs b/native/core/src/execution/shuffle/list.rs
index e4ee31618a..267f343ff5 100644
--- a/native/core/src/execution/shuffle/list.rs
+++ b/native/core/src/execution/shuffle/list.rs
@@ -101,10 +101,7 @@ impl SparkUnsafeArray {
     /// For non-nullable: uses slice append for optimal performance.
     /// For nullable: uses pointer iteration with efficient null bitset reading.
     #[inline]
-    pub(crate) fn append_ints_to_builder<const NULLABLE: bool>(
-        &self,
-        builder: &mut Int32Builder,
-    ) {
+    pub(crate) fn append_ints_to_builder<const NULLABLE: bool>(&self, builder: &mut Int32Builder) {
         let num_elements = self.num_elements;
         if num_elements == 0 {
             return;
@@ -136,10 +133,7 @@ impl SparkUnsafeArray {
 
     /// Bulk append i64 values to builder.
     #[inline]
-    pub(crate) fn append_longs_to_builder<const NULLABLE: bool>(
-        &self,
-        builder: &mut Int64Builder,
-    ) {
+    pub(crate) fn append_longs_to_builder<const NULLABLE: bool>(&self, builder: &mut Int64Builder) {
         let num_elements = self.num_elements;
         if num_elements == 0 {
             return;
@@ -204,10 +198,7 @@ impl SparkUnsafeArray {
 
     /// Bulk append i8 values to builder.
     #[inline]
-    pub(crate) fn append_bytes_to_builder<const NULLABLE: bool>(
-        &self,
-        builder: &mut Int8Builder,
-    ) {
+    pub(crate) fn append_bytes_to_builder<const NULLABLE: bool>(&self, builder: &mut Int8Builder) {
         let num_elements = self.num_elements;
         if num_elements == 0 {
             return;

From a224e222f3926cca8b404288ad2b0d8145a62643 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 11:35:44 -0700
Subject: [PATCH 3/4] refactor: remove unnecessary #[inline] from large
 functions

The #[inline] attribute on functions with loops iterating over thousands
of elements provides no benefit - the function call overhead is negligible
compared to loop body execution, and inlining large functions causes
instruction cache pressure.

Keep #[inline] only on small helper functions:
- get_header_portion_in_bytes (tiny const fn)
- is_null_at (small, hot path)
- null_bitset_ptr (tiny accessor)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 native/core/src/execution/shuffle/list.rs | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/native/core/src/execution/shuffle/list.rs b/native/core/src/execution/shuffle/list.rs
index 267f343ff5..6236bdf5c1 100644
--- a/native/core/src/execution/shuffle/list.rs
+++ b/native/core/src/execution/shuffle/list.rs
@@ -100,7 +100,6 @@ impl SparkUnsafeArray {
     /// Bulk append i32 values to builder.
     /// For non-nullable: uses slice append for optimal performance.
     /// For nullable: uses pointer iteration with efficient null bitset reading.
-    #[inline]
     pub(crate) fn append_ints_to_builder<const NULLABLE: bool>(&self, builder: &mut Int32Builder) {
         let num_elements = self.num_elements;
         if num_elements == 0 {
@@ -132,7 +131,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append i64 values to builder.
-    #[inline]
     pub(crate) fn append_longs_to_builder<const NULLABLE: bool>(&self, builder: &mut Int64Builder) {
         let num_elements = self.num_elements;
         if num_elements == 0 {
@@ -163,7 +161,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append i16 values to builder.
-    #[inline]
     pub(crate) fn append_shorts_to_builder<const NULLABLE: bool>(
         &self,
         builder: &mut Int16Builder,
@@ -197,7 +194,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append i8 values to builder.
-    #[inline]
     pub(crate) fn append_bytes_to_builder<const NULLABLE: bool>(&self, builder: &mut Int8Builder) {
         let num_elements = self.num_elements;
         if num_elements == 0 {
@@ -228,7 +224,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append f32 values to builder.
-    #[inline]
     pub(crate) fn append_floats_to_builder<const NULLABLE: bool>(
         &self,
         builder: &mut Float32Builder,
@@ -262,7 +257,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append f64 values to builder.
-    #[inline]
     pub(crate) fn append_doubles_to_builder<const NULLABLE: bool>(
         &self,
         builder: &mut Float64Builder,
@@ -296,7 +290,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append boolean values to builder using pointer iteration.
-    #[inline]
     pub(crate) fn append_booleans_to_builder<const NULLABLE: bool>(
         &self,
         builder: &mut BooleanBuilder,
@@ -331,7 +324,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append timestamp values to builder.
-    #[inline]
     pub(crate) fn append_timestamps_to_builder<const NULLABLE: bool>(
         &self,
         builder: &mut TimestampMicrosecondBuilder,
@@ -365,7 +357,6 @@ impl SparkUnsafeArray {
     }
 
     /// Bulk append date values to builder.
-    #[inline]
     pub(crate) fn append_dates_to_builder<const NULLABLE: bool>(
         &self,
         builder: &mut Date32Builder,

From fe5454877e8207e5c25e5134586bfd7fbb9533a3 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 11:41:31 -0700
Subject: [PATCH 4/4] fix: address clippy warnings in benchmark

- Remove unused ArrayBuilder import
- Use div_ceil() instead of manual implementation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 native/core/benches/array_conversion.rs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/native/core/benches/array_conversion.rs b/native/core/benches/array_conversion.rs
index ed6639a146..5007bff1ce 100644
--- a/native/core/benches/array_conversion.rs
+++ b/native/core/benches/array_conversion.rs
@@ -19,8 +19,7 @@
 //! This specifically tests the append_to_builder function used in shuffle read path.
 
 use arrow::array::builder::{
-    ArrayBuilder, Date32Builder, Float64Builder, Int32Builder, Int64Builder,
-    TimestampMicrosecondBuilder,
+    Date32Builder, Float64Builder, Int32Builder, Int64Builder, TimestampMicrosecondBuilder,
 };
 use arrow::datatypes::{DataType, TimeUnit};
 use comet::execution::shuffle::list::{append_to_builder, SparkUnsafeArray};
@@ -35,7 +34,7 @@ const NUM_ELEMENTS: usize = 10000;
 /// - element data: 4 bytes per element (i32)
 fn create_spark_unsafe_array_i32(num_elements: usize, with_nulls: bool) -> Vec<u8> {
     // Header size: 8 (num_elements) + ceil(num_elements/64) * 8 (null bitset)
-    let null_bitset_words = (num_elements + 63) / 64;
+    let null_bitset_words = num_elements.div_ceil(64);
     let header_size = 8 + null_bitset_words * 8;
     let data_size = num_elements * 4; // i32 = 4 bytes
     let total_size = header_size + data_size;
@@ -69,7 +68,7 @@ fn create_spark_unsafe_array_i32(num_elements: usize, with_nulls: bool) -> Vec<u
 
 /// Create a SparkUnsafeArray in memory with i64 elements.
 fn create_spark_unsafe_array_i64(num_elements: usize, with_nulls: bool) -> Vec<u8> {
-    let null_bitset_words = (num_elements + 63) / 64;
+    let null_bitset_words = num_elements.div_ceil(64);
     let header_size = 8 + null_bitset_words * 8;
     let data_size = num_elements * 8; // i64 = 8 bytes
     let total_size = header_size + data_size;
@@ -103,7 +102,7 @@ fn create_spark_unsafe_array_i64(num_elements: usize, with_nulls: bool) -> Vec<u
 
 /// Create a SparkUnsafeArray in memory with f64 elements.
 fn create_spark_unsafe_array_f64(num_elements: usize, with_nulls: bool) -> Vec<u8> {
-    let null_bitset_words = (num_elements + 63) / 64;
+    let null_bitset_words = num_elements.div_ceil(64);
     let header_size = 8 + null_bitset_words * 8;
     let data_size = num_elements * 8; // f64 = 8 bytes
     let total_size = header_size + data_size;