Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions src/row/arch/neon/endian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//! Endian-aware u16/u32 SIMD loaders for AArch64 NEON.
#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
//!
//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
//! and returns a NEON vector containing the elements in **host-native** byte
//! order, ready for native u16/u32 SIMD math.
//!
//! The host-native conversion is monomorphized at compile time via
//! `cfg(target_endian = ...)`:
//! - `load_le_*` is a no-op on LE targets, byte-swap on BE targets
//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
//!
//! Tier kernels call the generic dispatchers `load_endian_u16x8::<BE>` and
//! `load_endian_u32x4::<BE>` from their own `<const BE: bool>` contexts.
//! The `if BE { ... } else { ... }` in the dispatcher is eliminated by the
//! compiler — each monomorphization sees only one branch.

use core::arch::aarch64::*;

// ---- u16x8 loaders ---------------------------------------------------------

/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
///
/// Plain load on little-endian hosts; on big-endian hosts the two bytes of
/// each 16-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
#[inline(always)]
pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> uint16x8_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u16(ptr.cast()) };
    if cfg!(target_endian = "little") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(raw))) }
    }
}

/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
///
/// Plain load on big-endian hosts; on little-endian hosts the two bytes of
/// each 16-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
#[inline(always)]
pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> uint16x8_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u16(ptr.cast()) };
    if cfg!(target_endian = "big") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(raw))) }
    }
}

/// Generic dispatcher: routes to [`load_be_u16x8`] or [`load_le_u16x8`] based
/// on the compile-time `BE` const parameter. Each monomorphization sees only
/// one arm, so the branch costs nothing at runtime.
///
/// # Safety
///
/// Same as `load_le_u16x8` / `load_be_u16x8`.
#[inline(always)]
pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> uint16x8_t {
    // SAFETY: the caller's contract is forwarded verbatim to the chosen loader.
    match BE {
        true => unsafe { load_be_u16x8(ptr) },
        false => unsafe { load_le_u16x8(ptr) },
    }
}

// ---- u32x4 loaders ---------------------------------------------------------

/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
///
/// Plain load on little-endian hosts; on big-endian hosts the four bytes of
/// each 32-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled.
#[inline(always)]
pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> uint32x4_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u32(ptr.cast()) };
    if cfg!(target_endian = "little") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(raw))) }
    }
}

/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
///
/// Plain load on big-endian hosts; on little-endian hosts the four bytes of
/// each 32-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled.
#[inline(always)]
pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> uint32x4_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u32(ptr.cast()) };
    if cfg!(target_endian = "big") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(raw))) }
    }
}

/// Generic dispatcher: routes to [`load_be_u32x4`] or [`load_le_u32x4`] based
/// on the compile-time `BE` const parameter. Each monomorphization sees only
/// one arm, so the branch costs nothing at runtime.
///
/// # Safety
///
/// Same as `load_le_u32x4` / `load_be_u32x4`.
#[inline(always)]
pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32x4_t {
    // SAFETY: the caller's contract is forwarded verbatim to the chosen loader.
    match BE {
        true => unsafe { load_be_u32x4(ptr) },
        false => unsafe { load_le_u32x4(ptr) },
    }
}
1 change: 1 addition & 0 deletions src/row/arch/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ pub(super) use crate::{ColorMatrix, row::scalar};

pub(crate) mod alpha_extract;
mod ayuv64;
pub(crate) mod endian;
mod gray;
mod hsv;
pub(crate) mod legacy_rgb;
Expand Down
191 changes: 191 additions & 0 deletions src/row/arch/neon/tests/endian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
use crate::row::arch::neon::endian::*;

// Helper: extract uint16x8_t to a stack array.
unsafe fn u16x8_to_arr(v: core::arch::aarch64::uint16x8_t) -> [u16; 8] {
    let mut lanes = [0u16; 8];
    // SAFETY: `lanes` supplies exactly the 16 writable, u16-typed bytes
    // that vst1q_u16 stores.
    unsafe { core::arch::aarch64::vst1q_u16(lanes.as_mut_ptr(), v) };
    lanes
}

// Helper: extract uint32x4_t to a stack array.
unsafe fn u32x4_to_arr(v: core::arch::aarch64::uint32x4_t) -> [u32; 4] {
    let mut lanes = [0u32; 4];
    // SAFETY: `lanes` supplies exactly the 16 writable, u32-typed bytes
    // that vst1q_u32 stores.
    unsafe { core::arch::aarch64::vst1q_u32(lanes.as_mut_ptr(), v) };
    lanes
}

// ---- LE loader on LE host (no-op) ------------------------------------------

/// On a LE host, `load_le_u16x8` must NOT swap bytes — the in-memory LE
/// layout already matches host-native order.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_le_u16x8_noop_on_le_host() {
// 0x0102 stored LE = bytes [0x02, 0x01]; host reads as 0x0102.
Comment on lines +19 to +25
let input: [u8; 16] = [
0x02, 0x01, // u16[0] = 0x0102
0x04, 0x03, // u16[1] = 0x0304
0x06, 0x05, // u16[2] = 0x0506
0x08, 0x07, // u16[3] = 0x0708
0x0a, 0x09, // u16[4] = 0x090a
0x0c, 0x0b, // u16[5] = 0x0b0c
0x0e, 0x0d, // u16[6] = 0x0d0e
0x10, 0x0f, // u16[7] = 0x0f10
];
let v = unsafe { load_le_u16x8(input.as_ptr()) };
let got = unsafe { u16x8_to_arr(v) };
assert_eq!(
got,
[
0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
],
"load_le_u16x8 must not swap on LE host"
);
}

/// On a BE host, `load_le_u16x8` MUST swap bytes so the host-native value
/// matches the original LE value.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "big")]
fn neon_load_le_u16x8_swaps_on_be_host() {
    // Same LE byte layout as the LE-host test; a raw vld1q_u16 on a BE host
    // would yield byte-swapped lanes, so load_le_u16x8 must reverse that.
    let bytes: [u8; 16] = [
        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d,
        0x10, 0x0f,
    ];
    let lanes = unsafe { u16x8_to_arr(load_le_u16x8(bytes.as_ptr())) };
    let expected: [u16; 8] = [0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10];
    assert_eq!(
        lanes, expected,
        "load_le_u16x8 must swap bytes on BE host to restore LE value"
    );
}

// ---- BE loader on LE host (swap) -------------------------------------------

/// On a LE host, `load_be_u16x8` MUST swap bytes — BE-encoded data has the
/// most-significant byte first, which needs swapping for LE-native math.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_be_u16x8_swaps_on_le_host() {
    // 0x0102 stored BE = bytes [0x01, 0x02]; sixteen bytes encode eight
    // consecutive BE u16 values 0x0102..=0x0f10.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let lanes = unsafe { u16x8_to_arr(load_be_u16x8(bytes.as_ptr())) };
    let expected: [u16; 8] = [0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10];
    assert_eq!(
        lanes, expected,
        "load_be_u16x8 must swap on LE host to convert BE→LE-native"
    );
}

/// On a BE host, `load_be_u16x8` must NOT swap — BE-encoded data already
/// matches host-native order.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "big")]
fn neon_load_be_u16x8_noop_on_be_host() {
    // Eight consecutive BE-encoded u16 values 0x0102..=0x0f10.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let lanes = unsafe { u16x8_to_arr(load_be_u16x8(bytes.as_ptr())) };
    let expected: [u16; 8] = [0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10];
    assert_eq!(lanes, expected, "load_be_u16x8 must not swap on BE host");
}

// ---- u32x4 LE loader on LE host (no-op) ------------------------------------

/// On a LE host, `load_le_u32x4` must NOT swap — LE encoding is already
/// host-native.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_le_u32x4_noop_on_le_host() {
    // Four LE-encoded u32 values 0x01020304..=0x0d0e0f10.
    let bytes: [u8; 16] = [
        0x04, 0x03, 0x02, 0x01, 0x08, 0x07, 0x06, 0x05, 0x0c, 0x0b, 0x0a, 0x09, 0x10, 0x0f,
        0x0e, 0x0d,
    ];
    let lanes = unsafe { u32x4_to_arr(load_le_u32x4(bytes.as_ptr())) };
    let expected: [u32; 4] = [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10];
    assert_eq!(lanes, expected, "load_le_u32x4 must not swap on LE host");
}

// ---- u32x4 BE loader on LE host (swap) -------------------------------------

/// On a LE host, `load_be_u32x4` MUST swap — BE encoding needs per-lane
/// byte reversal for LE-native math.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_be_u32x4_swaps_on_le_host() {
    // Four BE-encoded u32 values 0x01020304..=0x0d0e0f10.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let lanes = unsafe { u32x4_to_arr(load_be_u32x4(bytes.as_ptr())) };
    let expected: [u32; 4] = [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10];
    assert_eq!(lanes, expected, "load_be_u32x4 must swap on LE host");
}

// ---- Generic dispatcher consistency ----------------------------------------

/// Verify `load_endian_u16x8::<false>` routes to the LE loader.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_endian_u16x8_le_dispatcher() {
    // LE-encoded u16 values; both paths must decode identical lanes.
    let bytes: [u8; 16] = [
        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d,
        0x10, 0x0f,
    ];
    let (direct, dispatched) = unsafe {
        (
            u16x8_to_arr(load_le_u16x8(bytes.as_ptr())),
            u16x8_to_arr(load_endian_u16x8::<false>(bytes.as_ptr())),
        )
    };
    assert_eq!(
        direct, dispatched,
        "load_endian_u16x8::<false> must match load_le_u16x8"
    );
}

/// Verify `load_endian_u16x8::<true>` routes to the BE loader.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_endian_u16x8_be_dispatcher() {
    // BE-encoded u16 values; both paths must decode identical lanes.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let (direct, dispatched) = unsafe {
        (
            u16x8_to_arr(load_be_u16x8(bytes.as_ptr())),
            u16x8_to_arr(load_endian_u16x8::<true>(bytes.as_ptr())),
        )
    };
    assert_eq!(
        direct, dispatched,
        "load_endian_u16x8::<true> must match load_be_u16x8"
    );
}
1 change: 1 addition & 0 deletions src/row/arch/neon/tests/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use super::*;

mod ayuv64;
mod endian;
mod high_bit_4_2_0;
mod high_bit_4_4_4_and_pn;
mod legacy_rgb;
Expand Down
Loading
Loading