Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions src/row/arch/neon/endian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//! Endian-aware u16/u32 SIMD loaders for AArch64 NEON.
#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
//!
//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
//! and returns a NEON vector containing the elements in **host-native** byte
//! order, ready for native u16/u32 SIMD math.
//!
//! The host-native conversion is monomorphized at compile time via
//! `cfg(target_endian = ...)`:
//! - `load_le_*` is a no-op on LE targets, byte-swap on BE targets
//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
//!
//! Tier kernels call the generic dispatchers `load_endian_u16x8::<BE>` and
//! `load_endian_u32x4::<BE>` from their own `<const BE: bool>` contexts.
//! The `if BE { ... } else { ... }` in the dispatcher is eliminated by the
//! compiler — each monomorphization sees only one branch.

use core::arch::aarch64::*;

// ---- u16x8 loaders ---------------------------------------------------------

/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
///
/// Plain load on little-endian hosts; on big-endian hosts the two bytes of
/// each 16-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
#[inline(always)]
pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> uint16x8_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u16(ptr.cast()) };
    if cfg!(target_endian = "little") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(raw))) }
    }
}

/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
///
/// Plain load on big-endian hosts; on little-endian hosts the two bytes of
/// each 16-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
#[inline(always)]
pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> uint16x8_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u16(ptr.cast()) };
    if cfg!(target_endian = "big") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(raw))) }
    }
}

/// Generic dispatcher: routes to [`load_be_u16x8`] or [`load_le_u16x8`] based
/// on the compile-time `BE` const parameter. Each monomorphization sees only
/// one arm, so the branch costs nothing at runtime.
///
/// # Safety
///
/// Same as `load_le_u16x8` / `load_be_u16x8`.
#[inline(always)]
pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> uint16x8_t {
    // SAFETY: the caller's contract is forwarded verbatim to the chosen loader.
    match BE {
        true => unsafe { load_be_u16x8(ptr) },
        false => unsafe { load_le_u16x8(ptr) },
    }
}

// ---- u32x4 loaders ---------------------------------------------------------

/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
///
/// Plain load on little-endian hosts; on big-endian hosts the four bytes of
/// each 32-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled.
#[inline(always)]
pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> uint32x4_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u32(ptr.cast()) };
    if cfg!(target_endian = "little") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(raw))) }
    }
}

/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
///
/// Plain load on big-endian hosts; on little-endian hosts the four bytes of
/// each 32-bit lane are reversed after loading.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
/// Caller must have NEON enabled.
#[inline(always)]
pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> uint32x4_t {
    // SAFETY: caller guarantees 16 readable bytes and an active NEON feature.
    let raw = unsafe { vld1q_u32(ptr.cast()) };
    if cfg!(target_endian = "big") {
        raw
    } else {
        // SAFETY: register-only per-lane byte swap; same NEON requirement as above.
        unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(raw))) }
    }
}

/// Generic dispatcher: routes to [`load_be_u32x4`] or [`load_le_u32x4`] based
/// on the compile-time `BE` const parameter. Each monomorphization sees only
/// one arm, so the branch costs nothing at runtime.
///
/// # Safety
///
/// Same as `load_le_u32x4` / `load_be_u32x4`.
#[inline(always)]
pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32x4_t {
    // SAFETY: the caller's contract is forwarded verbatim to the chosen loader.
    match BE {
        true => unsafe { load_be_u32x4(ptr) },
        false => unsafe { load_le_u32x4(ptr) },
    }
}
1 change: 1 addition & 0 deletions src/row/arch/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ pub(super) use crate::{ColorMatrix, row::scalar};

pub(crate) mod alpha_extract;
mod ayuv64;
pub(crate) mod endian;
mod gray;
mod hsv;
pub(crate) mod legacy_rgb;
Expand Down
191 changes: 191 additions & 0 deletions src/row/arch/neon/tests/endian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
use crate::row::arch::neon::endian::*;

// Helper: extract uint16x8_t to a stack array.
unsafe fn u16x8_to_arr(v: core::arch::aarch64::uint16x8_t) -> [u16; 8] {
    let mut lanes = [0u16; 8];
    // SAFETY: `lanes` supplies exactly the 16 writable, u16-typed bytes
    // that vst1q_u16 stores.
    unsafe { core::arch::aarch64::vst1q_u16(lanes.as_mut_ptr(), v) };
    lanes
}

// Helper: extract uint32x4_t to a stack array.
unsafe fn u32x4_to_arr(v: core::arch::aarch64::uint32x4_t) -> [u32; 4] {
    let mut lanes = [0u32; 4];
    // SAFETY: `lanes` supplies exactly the 16 writable, u32-typed bytes
    // that vst1q_u32 stores.
    unsafe { core::arch::aarch64::vst1q_u32(lanes.as_mut_ptr(), v) };
    lanes
}

// ---- LE loader on LE host (no-op) ------------------------------------------

/// On a LE host, `load_le_u16x8` must NOT swap bytes — the in-memory LE
/// layout already matches host-native order.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_le_u16x8_noop_on_le_host() {
// 0x0102 stored LE = bytes [0x02, 0x01]; host reads as 0x0102.
Comment on lines +19 to +25
let input: [u8; 16] = [
0x02, 0x01, // u16[0] = 0x0102
0x04, 0x03, // u16[1] = 0x0304
0x06, 0x05, // u16[2] = 0x0506
0x08, 0x07, // u16[3] = 0x0708
0x0a, 0x09, // u16[4] = 0x090a
0x0c, 0x0b, // u16[5] = 0x0b0c
0x0e, 0x0d, // u16[6] = 0x0d0e
0x10, 0x0f, // u16[7] = 0x0f10
];
let v = unsafe { load_le_u16x8(input.as_ptr()) };
let got = unsafe { u16x8_to_arr(v) };
assert_eq!(
got,
[
0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
],
"load_le_u16x8 must not swap on LE host"
);
}

/// On a BE host, `load_le_u16x8` MUST swap bytes so the host-native value
/// matches the original LE value.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "big")]
fn neon_load_le_u16x8_swaps_on_be_host() {
    // Same LE byte layout as the LE-host test; a raw vld1q_u16 on a BE host
    // would yield byte-swapped lanes, so load_le_u16x8 must reverse that.
    let bytes: [u8; 16] = [
        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d,
        0x10, 0x0f,
    ];
    let lanes = unsafe { u16x8_to_arr(load_le_u16x8(bytes.as_ptr())) };
    let expected: [u16; 8] = [0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10];
    assert_eq!(
        lanes, expected,
        "load_le_u16x8 must swap bytes on BE host to restore LE value"
    );
}

// ---- BE loader on LE host (swap) -------------------------------------------

/// On a LE host, `load_be_u16x8` MUST swap bytes — BE-encoded data has the
/// most-significant byte first, which needs swapping for LE-native math.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_be_u16x8_swaps_on_le_host() {
    // 0x0102 stored BE = bytes [0x01, 0x02]; sixteen bytes encode eight
    // consecutive BE u16 values 0x0102..=0x0f10.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let lanes = unsafe { u16x8_to_arr(load_be_u16x8(bytes.as_ptr())) };
    let expected: [u16; 8] = [0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10];
    assert_eq!(
        lanes, expected,
        "load_be_u16x8 must swap on LE host to convert BE→LE-native"
    );
}

/// On a BE host, `load_be_u16x8` must NOT swap — BE-encoded data already
/// matches host-native order.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "big")]
fn neon_load_be_u16x8_noop_on_be_host() {
    // Eight consecutive BE-encoded u16 values 0x0102..=0x0f10.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let lanes = unsafe { u16x8_to_arr(load_be_u16x8(bytes.as_ptr())) };
    let expected: [u16; 8] = [0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10];
    assert_eq!(lanes, expected, "load_be_u16x8 must not swap on BE host");
}

// ---- u32x4 LE loader on LE host (no-op) ------------------------------------

/// On a LE host, `load_le_u32x4` must NOT swap — LE encoding is already
/// host-native.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_le_u32x4_noop_on_le_host() {
    // Four LE-encoded u32 values 0x01020304..=0x0d0e0f10.
    let bytes: [u8; 16] = [
        0x04, 0x03, 0x02, 0x01, 0x08, 0x07, 0x06, 0x05, 0x0c, 0x0b, 0x0a, 0x09, 0x10, 0x0f,
        0x0e, 0x0d,
    ];
    let lanes = unsafe { u32x4_to_arr(load_le_u32x4(bytes.as_ptr())) };
    let expected: [u32; 4] = [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10];
    assert_eq!(lanes, expected, "load_le_u32x4 must not swap on LE host");
}

// ---- u32x4 BE loader on LE host (swap) -------------------------------------

/// On a LE host, `load_be_u32x4` MUST swap — BE encoding needs per-lane
/// byte reversal for LE-native math.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_be_u32x4_swaps_on_le_host() {
    // Four BE-encoded u32 values 0x01020304..=0x0d0e0f10.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let lanes = unsafe { u32x4_to_arr(load_be_u32x4(bytes.as_ptr())) };
    let expected: [u32; 4] = [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10];
    assert_eq!(lanes, expected, "load_be_u32x4 must swap on LE host");
}

// ---- Generic dispatcher consistency ----------------------------------------

/// Verify `load_endian_u16x8::<false>` routes to the LE loader.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_endian_u16x8_le_dispatcher() {
    // LE-encoded u16 values; both paths must decode identical lanes.
    let bytes: [u8; 16] = [
        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d,
        0x10, 0x0f,
    ];
    let (direct, dispatched) = unsafe {
        (
            u16x8_to_arr(load_le_u16x8(bytes.as_ptr())),
            u16x8_to_arr(load_endian_u16x8::<false>(bytes.as_ptr())),
        )
    };
    assert_eq!(
        direct, dispatched,
        "load_endian_u16x8::<false> must match load_le_u16x8"
    );
}

/// Verify `load_endian_u16x8::<true>` routes to the BE loader.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
#[cfg(target_endian = "little")]
fn neon_load_endian_u16x8_be_dispatcher() {
    // BE-encoded u16 values; both paths must decode identical lanes.
    let bytes: [u8; 16] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        0x0f, 0x10,
    ];
    let (direct, dispatched) = unsafe {
        (
            u16x8_to_arr(load_be_u16x8(bytes.as_ptr())),
            u16x8_to_arr(load_endian_u16x8::<true>(bytes.as_ptr())),
        )
    };
    assert_eq!(
        direct, dispatched,
        "load_endian_u16x8::<true> must match load_be_u16x8"
    );
}
1 change: 1 addition & 0 deletions src/row/arch/neon/tests/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use super::*;

mod ayuv64;
mod endian;
mod high_bit_4_2_0;
mod high_bit_4_4_4_and_pn;
mod legacy_rgb;
Expand Down
Loading
Loading