diff --git a/src/row/arch/neon/endian.rs b/src/row/arch/neon/endian.rs
new file mode 100644
index 00000000..6f800f7b
--- /dev/null
+++ b/src/row/arch/neon/endian.rs
@@ -0,0 +1,109 @@
+//! Endian-aware u16/u32 SIMD loaders for AArch64 NEON.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a NEON vector containing the elements in **host-native** byte
+//! order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets, byte-swap on BE targets
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Tier kernels call the generic dispatchers `load_endian_u16x8::<BE>` and
+//! `load_endian_u32x4::<BE>` from their own `const BE: bool` contexts.
+//! The `if BE { ... } else { ... }` in the dispatcher is eliminated by the
+//! compiler — each monomorphization sees only one branch.
+
+use core::arch::aarch64::*;
+
+// ---- u16x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> uint16x8_t {
+    let v = unsafe { vld1q_u16(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) };
+    v
+}
+
+/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> uint16x8_t {
+    let v = unsafe { vld1q_u16(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8` based on
+/// the compile-time `BE` const parameter. The unused branch is eliminated by
+/// the compiler when the caller is monomorphized.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x8` / `load_be_u16x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> uint16x8_t {
+    if BE {
+        unsafe { load_be_u16x8(ptr) }
+    } else {
+        unsafe { load_le_u16x8(ptr) }
+    }
+}
+
+// ---- u32x4 loaders ---------------------------------------------------------
+
+/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> uint32x4_t {
+    let v = unsafe { vld1q_u32(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))) };
+    v
+}
+
+/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled.
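+///
+/// A minimal illustrative sketch of a call site (the wrapping function is an
+/// assumption for illustration, not part of this crate):
+///
+/// ```ignore
+/// #[target_feature(enable = "neon")]
+/// unsafe fn first_be_u32(src: &[u8; 16]) -> u32 {
+///     let v = unsafe { load_be_u32x4(src.as_ptr()) };
+///     // e.g. bytes [0x01, 0x02, 0x03, 0x04, ..] -> lane 0 == 0x01020304
+///     unsafe { vgetq_lane_u32::<0>(v) }
+/// }
+/// ```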
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> uint32x4_t {
+    let v = unsafe { vld1q_u32(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x4` or `load_be_u32x4` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x4` / `load_be_u32x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32x4_t {
+    if BE {
+        unsafe { load_be_u32x4(ptr) }
+    } else {
+        unsafe { load_le_u32x4(ptr) }
+    }
+}
diff --git a/src/row/arch/neon/mod.rs b/src/row/arch/neon/mod.rs
index c46d02c1..86b8d00a 100644
--- a/src/row/arch/neon/mod.rs
+++ b/src/row/arch/neon/mod.rs
@@ -39,6 +39,7 @@ pub(super) use crate::{ColorMatrix, row::scalar};
 
 pub(crate) mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/neon/tests/endian.rs b/src/row/arch/neon/tests/endian.rs
new file mode 100644
index 00000000..79c7cb8c
--- /dev/null
+++ b/src/row/arch/neon/tests/endian.rs
@@ -0,0 +1,191 @@
+use crate::row::arch::neon::endian::*;
+
+// Helper: extract uint16x8_t to a stack array.
+unsafe fn u16x8_to_arr(v: core::arch::aarch64::uint16x8_t) -> [u16; 8] {
+    let mut out = [0u16; 8];
+    unsafe { core::arch::aarch64::vst1q_u16(out.as_mut_ptr(), v) };
+    out
+}
+
+// Helper: extract uint32x4_t to a stack array.
+unsafe fn u32x4_to_arr(v: core::arch::aarch64::uint32x4_t) -> [u32; 4] {
+    let mut out = [0u32; 4];
+    unsafe { core::arch::aarch64::vst1q_u32(out.as_mut_ptr(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+/// On a LE host, `load_le_u16x8` must NOT swap bytes — the in-memory LE
+/// layout already matches host-native order.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_le_u16x8_noop_on_le_host() {
+    // 0x0102 stored LE = bytes [0x02, 0x01]; host reads as 0x0102.
+    let input: [u8; 16] = [
+        0x02, 0x01, // u16[0] = 0x0102
+        0x04, 0x03, // u16[1] = 0x0304
+        0x06, 0x05, // u16[2] = 0x0506
+        0x08, 0x07, // u16[3] = 0x0708
+        0x0a, 0x09, // u16[4] = 0x090a
+        0x0c, 0x0b, // u16[5] = 0x0b0c
+        0x0e, 0x0d, // u16[6] = 0x0d0e
+        0x10, 0x0f, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_le_u16x8 must not swap on LE host"
+    );
+}
+
+/// On a BE host, `load_le_u16x8` MUST swap bytes so the host-native value
+/// matches the original LE value.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn neon_load_le_u16x8_swaps_on_be_host() {
+    // Same byte layout as above; on a BE host the raw vld1q_u16 would give
+    // the byte-swapped values, so load_le_u16x8 must reverse that.
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_le_u16x8 must swap bytes on BE host to restore LE value"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+/// On a LE host, `load_be_u16x8` MUST swap bytes — BE-encoded data has the
+/// most-significant byte first, which needs swapping for LE-native math.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_be_u16x8_swaps_on_le_host() {
+    // 0x0102 stored BE = bytes [0x01, 0x02].
+    let input: [u8; 16] = [
+        0x01, 0x02, // u16[0] = 0x0102 in BE encoding
+        0x03, 0x04, // u16[1] = 0x0304
+        0x05, 0x06, // u16[2] = 0x0506
+        0x07, 0x08, // u16[3] = 0x0708
+        0x09, 0x0a, // u16[4] = 0x090a
+        0x0b, 0x0c, // u16[5] = 0x0b0c
+        0x0d, 0x0e, // u16[6] = 0x0d0e
+        0x0f, 0x10, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_be_u16x8 must swap on LE host to convert BE→LE-native"
+    );
+}
+
+/// On a BE host, `load_be_u16x8` must NOT swap — BE-encoded data already
+/// matches host-native order.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn neon_load_be_u16x8_noop_on_be_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_be_u16x8 must not swap on BE host"
+    );
+}
+
+// ---- u32x4 LE loader on LE host (no-op) ------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_le_u32x4_noop_on_le_host() {
+    let input: [u8; 16] = [
+        0x04, 0x03, 0x02, 0x01, // u32[0] = 0x01020304 LE
+        0x08, 0x07, 0x06, 0x05, // u32[1] = 0x05060708
+        0x0c, 0x0b, 0x0a, 0x09, // u32[2] = 0x090a0b0c
+        0x10, 0x0f, 0x0e, 0x0d, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_le_u32x4(input.as_ptr()) };
+    let got = unsafe { u32x4_to_arr(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "load_le_u32x4 must not swap on LE host"
+    );
+}
+
+// ---- u32x4 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_be_u32x4_swaps_on_le_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, // u32[0] = 0x01020304 BE
+        0x05, 0x06, 0x07, 0x08, // u32[1] = 0x05060708
+        0x09, 0x0a, 0x0b, 0x0c, // u32[2] = 0x090a0b0c
+        0x0d, 0x0e, 0x0f, 0x10, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_be_u32x4(input.as_ptr()) };
+    let got = unsafe { u32x4_to_arr(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "load_be_u32x4 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+/// Verify `load_endian_u16x8::<false>` routes to the LE loader.
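+/// With `BE = false` the big-endian branch in the dispatcher is statically
+/// dead, so this should compile down to the same bare `vld1q_u16` as the
+/// direct call.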
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_endian_u16x8_le_dispatcher() {
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let direct = unsafe { load_le_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<false>(input.as_ptr()) };
+    let d = unsafe { u16x8_to_arr(direct) };
+    let g = unsafe { u16x8_to_arr(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<false> must match load_le_u16x8");
+}
+
+/// Verify `load_endian_u16x8::<true>` routes to the BE loader.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_endian_u16x8_be_dispatcher() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let direct = unsafe { load_be_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<true>(input.as_ptr()) };
+    let d = unsafe { u16x8_to_arr(direct) };
+    let g = unsafe { u16x8_to_arr(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<true> must match load_be_u16x8");
+}
diff --git a/src/row/arch/neon/tests/mod.rs b/src/row/arch/neon/tests/mod.rs
index 225fa2cf..aa1e28ac 100644
--- a/src/row/arch/neon/tests/mod.rs
+++ b/src/row/arch/neon/tests/mod.rs
@@ -1,6 +1,7 @@
 use super::*;
 
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/wasm_simd128/endian.rs b/src/row/arch/wasm_simd128/endian.rs
new file mode 100644
index 00000000..e2c812c6
--- /dev/null
+++ b/src/row/arch/wasm_simd128/endian.rs
@@ -0,0 +1,123 @@
+//! Endian-aware u16/u32 SIMD loaders for WebAssembly simd128.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a `v128` vector containing the elements in **host-native** byte
+//! order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (wasm32 is LE), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `u8x16_swizzle`, which behaves like SSSE3
+//! `_mm_shuffle_epi8` for in-range indices; out-of-range indices (≥ 16) zero
+//! the output lane. The shuffle indices are expressed as `i8x16` constants
+//! (negative values zero-out lanes as well, but all our indices are 0..15,
+//! so they are in range as written).
+
+use core::arch::wasm32::*;
+
+// ---- u16x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled via `#[target_feature(enable = "simd128")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = {
+        // swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14]
+        let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = {
+        let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x8` / `load_be_u16x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> v128 {
+    if BE {
+        unsafe { load_be_u16x8(ptr) }
+    } else {
+        unsafe { load_le_u16x8(ptr) }
+    }
+}
+
+// ---- u32x4 loaders ---------------------------------------------------------
+
+/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = {
+        // swap bytes within each u32 lane: [3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12]
+        let mask = i8x16(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = {
+        let mask = i8x16(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x4` or `load_be_u32x4` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x4` / `load_be_u32x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> v128 {
+    if BE {
+        unsafe { load_be_u32x4(ptr) }
+    } else {
+        unsafe { load_le_u32x4(ptr) }
+    }
+}
diff --git a/src/row/arch/wasm_simd128/mod.rs b/src/row/arch/wasm_simd128/mod.rs
index 8d4ed835..eed6e69d 100644
--- a/src/row/arch/wasm_simd128/mod.rs
+++ b/src/row/arch/wasm_simd128/mod.rs
@@ -42,6 +42,7 @@ pub(super) use crate::{ColorMatrix, row::scalar};
 
 pub(crate) mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/wasm_simd128/tests/endian.rs b/src/row/arch/wasm_simd128/tests/endian.rs
new file mode 100644
index 00000000..7647eabd
--- /dev/null
+++ b/src/row/arch/wasm_simd128/tests/endian.rs
@@ -0,0 +1,173 @@
+use crate::row::arch::wasm_simd128::endian::*;
+
+// Helper: extract v128 to a stack array of 8 u16 lanes.
+unsafe fn v128_to_u16x8(v: core::arch::wasm32::v128) -> [u16; 8] {
+    let mut out = [0u16; 8];
+    unsafe { core::arch::wasm32::v128_store(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract v128 to a stack array of 4 u32 lanes.
+unsafe fn v128_to_u32x4(v: core::arch::wasm32::v128) -> [u32; 4] {
+    let mut out = [0u32; 4];
+    unsafe { core::arch::wasm32::v128_store(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+/// On a LE host (wasm32 is always LE), `load_le_u16x8` must NOT swap bytes.
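+/// (The `#[cfg(target_endian = "big")]` twins below are never built on
+/// wasm32, which is always LE; they mirror the BE-host tests in the other
+/// backends.)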
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_le_u16x8_noop_on_le_host() {
+    let input: [u8; 16] = [
+        0x02, 0x01, // u16[0] = 0x0102
+        0x04, 0x03, // u16[1] = 0x0304
+        0x06, 0x05, // u16[2] = 0x0506
+        0x08, 0x07, // u16[3] = 0x0708
+        0x0a, 0x09, // u16[4] = 0x090a
+        0x0c, 0x0b, // u16[5] = 0x0b0c
+        0x0e, 0x0d, // u16[6] = 0x0d0e
+        0x10, 0x0f, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_le_u16x8 must not swap on LE host"
+    );
+}
+
+/// On a BE host, `load_le_u16x8` MUST swap bytes.
+#[test]
+#[cfg(target_endian = "big")]
+fn wasm_load_le_u16x8_swaps_on_be_host() {
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_le_u16x8 must swap on BE host"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+/// On a LE host, `load_be_u16x8` MUST swap bytes.
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_be_u16x8_swaps_on_le_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, // u16[0] = 0x0102 BE
+        0x03, 0x04, // u16[1] = 0x0304
+        0x05, 0x06, // u16[2] = 0x0506
+        0x07, 0x08, // u16[3] = 0x0708
+        0x09, 0x0a, // u16[4] = 0x090a
+        0x0b, 0x0c, // u16[5] = 0x0b0c
+        0x0d, 0x0e, // u16[6] = 0x0d0e
+        0x0f, 0x10, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_be_u16x8 must swap on LE host"
+    );
+}
+
+/// On a BE host, `load_be_u16x8` must NOT swap.
+#[test]
+#[cfg(target_endian = "big")]
+fn wasm_load_be_u16x8_noop_on_be_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_be_u16x8 must not swap on BE host"
+    );
+}
+
+// ---- u32x4 LE loader on LE host (no-op) ------------------------------------
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_le_u32x4_noop_on_le_host() {
+    let input: [u8; 16] = [
+        0x04, 0x03, 0x02, 0x01, // u32[0] = 0x01020304 LE
+        0x08, 0x07, 0x06, 0x05, // u32[1] = 0x05060708
+        0x0c, 0x0b, 0x0a, 0x09, // u32[2] = 0x090a0b0c
+        0x10, 0x0f, 0x0e, 0x0d, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_le_u32x4(input.as_ptr()) };
+    let got = unsafe { v128_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "wasm load_le_u32x4 must not swap on LE host"
+    );
+}
+
+// ---- u32x4 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_be_u32x4_swaps_on_le_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, // u32[0] = 0x01020304 BE
+        0x05, 0x06, 0x07, 0x08, // u32[1] = 0x05060708
+        0x09, 0x0a, 0x0b, 0x0c, // u32[2] = 0x090a0b0c
+        0x0d, 0x0e, 0x0f, 0x10, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_be_u32x4(input.as_ptr()) };
+    let got = unsafe { v128_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "wasm load_be_u32x4 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_endian_u16x8_le_dispatcher() {
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let direct = unsafe { load_le_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<false>(input.as_ptr()) };
+    let d = unsafe { v128_to_u16x8(direct) };
+    let g = unsafe { v128_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<false> must match load_le_u16x8");
+}
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_endian_u16x8_be_dispatcher() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let direct = unsafe { load_be_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<true>(input.as_ptr()) };
+    let d = unsafe { v128_to_u16x8(direct) };
+    let g = unsafe { v128_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<true> must match load_be_u16x8");
+}
diff --git a/src/row/arch/wasm_simd128/tests/mod.rs b/src/row/arch/wasm_simd128/tests/mod.rs
index 660bc752..0ed1c869 100644
--- a/src/row/arch/wasm_simd128/tests/mod.rs
+++ b/src/row/arch/wasm_simd128/tests/mod.rs
@@ -5,6 +5,7 @@ use super::*;
 use crate::row::scalar::planar_gbr_f16 as scalar_f16;
 
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/x86_avx2/endian.rs b/src/row/arch/x86_avx2/endian.rs
new file mode 100644
index 00000000..d2dd6995
--- /dev/null
+++ b/src/row/arch/x86_avx2/endian.rs
@@ -0,0 +1,133 @@
+//! Endian-aware u16/u32 SIMD loaders for x86_64 AVX2.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a `__m256i` vector containing the elements in **host-native**
+//! byte order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (all real x86), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `_mm256_shuffle_epi8` (AVX2) using
+//! compile-time shuffle masks. The masks replicate the 128-bit SSE pattern
+//! across both 128-bit lanes of the 256-bit register — AVX2's `vpshufb`
+//! operates per-lane, so the same within-lane byte permutation is applied to
+//! both lanes independently.
+
+use core::arch::x86_64::*;
+
+// ---- Byte-swap shuffle masks -----------------------------------------------
+
+/// AVX2 `_mm256_shuffle_epi8` mask that swaps bytes within every 2-byte (u16)
+/// lane across both 128-bit halves.
+const BYTESWAP_MASK_U16: __m256i = unsafe {
+    core::mem::transmute([
+        // low 128-bit lane
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+        // high 128-bit lane (identical pattern)
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+    ])
+};
+
+/// AVX2 `_mm256_shuffle_epi8` mask that swaps bytes within every 4-byte (u32)
+/// lane across both 128-bit halves.
+const BYTESWAP_MASK_U32: __m256i = unsafe {
+    core::mem::transmute([
+        // low 128-bit lane
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+        // high 128-bit lane (identical pattern)
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+    ])
+};
+
+// ---- u16x16 loaders --------------------------------------------------------
+
+/// Loads 16 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled via `#[target_feature(enable = "avx2")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x16(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Loads 16 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x16(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x16` or `load_be_u16x16` based
+/// on the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x16` / `load_be_u16x16`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x16<const BE: bool>(ptr: *const u8) -> __m256i {
+    if BE {
+        unsafe { load_be_u16x16(ptr) }
+    } else {
+        unsafe { load_le_u16x16(ptr) }
+    }
+}
+
+// ---- u32x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled.
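+///
+/// A minimal illustrative sketch of a call site (the wrapping function is an
+/// assumption for illustration, not part of this crate):
+///
+/// ```ignore
+/// #[target_feature(enable = "avx2")]
+/// unsafe fn double_le_u32x8(src: &[u8; 32]) -> __m256i {
+///     // Lanes are host-native regardless of host endianness, so plain
+///     // u32 lane math is valid right after the load.
+///     let v = unsafe { load_le_u32x8(src.as_ptr()) };
+///     unsafe { _mm256_add_epi32(v, v) }
+/// }
+/// ```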
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x8(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Loads 8 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x8(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x8` or `load_be_u32x8` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x8` / `load_be_u32x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x8<const BE: bool>(ptr: *const u8) -> __m256i {
+    if BE {
+        unsafe { load_be_u32x8(ptr) }
+    } else {
+        unsafe { load_le_u32x8(ptr) }
+    }
+}
diff --git a/src/row/arch/x86_avx2/mod.rs b/src/row/arch/x86_avx2/mod.rs
index bf7be11b..abe1c871 100644
--- a/src/row/arch/x86_avx2/mod.rs
+++ b/src/row/arch/x86_avx2/mod.rs
@@ -59,6 +59,7 @@ pub(super) use crate::{
 
 mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/x86_avx2/tests/endian.rs b/src/row/arch/x86_avx2/tests/endian.rs
new file mode 100644
index 00000000..ff394233
--- /dev/null
+++ b/src/row/arch/x86_avx2/tests/endian.rs
@@ -0,0 +1,204 @@
+use crate::row::arch::x86_avx2::endian::*;
+
+// Helper: extract __m256i to a stack array of 16 u16 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m256i_to_u16x16(v: core::arch::x86_64::__m256i) -> [u16; 16] {
+    let mut out = [0u16; 16];
+    unsafe { core::arch::x86_64::_mm256_storeu_si256(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract __m256i to a stack array of 8 u32 lanes.
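+// (Same pattern as the u16 helper above; `_mm256_storeu_si256` is an
+// unaligned store, so a plain stack array is fine.)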
+#[cfg(target_arch = "x86_64")]
+unsafe fn m256i_to_u32x8(v: core::arch::x86_64::__m256i) -> [u32; 8] {
+    let mut out = [0u32; 8];
+    unsafe { core::arch::x86_64::_mm256_storeu_si256(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_le_u16x16_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+        0x12, 0x11, 0x14, 0x13, 0x16, 0x15, 0x18, 0x17, 0x1a, 0x19, 0x1c, 0x1b, 0x1e, 0x1d, 0x20, 0x1f,
+    ];
+    let v = unsafe { load_le_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_le_u16x16 must not swap on LE host"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx2_load_le_u16x16_swaps_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+        0x12, 0x11, 0x14, 0x13, 0x16, 0x15, 0x18, 0x17, 0x1a, 0x19, 0x1c, 0x1b, 0x1e, 0x1d, 0x20, 0x1f,
+    ];
+    let v = unsafe { load_le_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_le_u16x16 must swap on BE host"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_be_u16x16_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let v = unsafe { load_be_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_be_u16x16 must swap on LE host"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx2_load_be_u16x16_noop_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let v = unsafe { load_be_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_be_u16x16 must not swap on BE host"
+    );
+}
+
+// ---- u32x8 LE loader on LE host (no-op) ------------------------------------
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_le_u32x8_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x04, 0x03, 0x02, 0x01, 0x08, 0x07, 0x06, 0x05, 0x0c, 0x0b, 0x0a, 0x09, 0x10, 0x0f, 0x0e, 0x0d,
+        0x14, 0x13, 0x12, 0x11, 0x18, 0x17, 0x16, 0x15, 0x1c, 0x1b, 0x1a, 0x19, 0x20, 0x1f, 0x1e, 0x1d,
+    ];
+    let v = unsafe { load_le_u32x8(input.as_ptr()) };
+    let got = unsafe { m256i_to_u32x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20,
+        ],
+        "AVX2 load_le_u32x8 must not swap on LE host"
+    );
+}
+
+// ---- u32x8 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_be_u32x8_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let v = unsafe { load_be_u32x8(input.as_ptr()) };
+    let got = unsafe { m256i_to_u32x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20,
+        ],
+        "AVX2 load_be_u32x8 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_endian_u16x16_le_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+        0x12, 0x11, 0x14, 0x13, 0x16, 0x15, 0x18, 0x17, 0x1a, 0x19, 0x1c, 0x1b, 0x1e, 0x1d, 0x20, 0x1f,
+    ];
+    let direct = unsafe { load_le_u16x16(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x16::<false>(input.as_ptr()) };
+    let d = unsafe { m256i_to_u16x16(direct) };
+    let g = unsafe { m256i_to_u16x16(via_dispatch) };
+    assert_eq!(
+        d, g,
+        "load_endian_u16x16::<false> must match load_le_u16x16"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_endian_u16x16_be_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let direct = unsafe { load_be_u16x16(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x16::<true>(input.as_ptr()) };
+    let d = unsafe { m256i_to_u16x16(direct) };
+    let g = unsafe { m256i_to_u16x16(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x16::<true> must match load_be_u16x16");
+}
diff --git a/src/row/arch/x86_avx2/tests/mod.rs b/src/row/arch/x86_avx2/tests/mod.rs
index 0acc31f3..fcb6df4b 100644
--- a/src/row/arch/x86_avx2/tests/mod.rs
+++ b/src/row/arch/x86_avx2/tests/mod.rs
@@ -1,4 +1,5 @@
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/x86_avx512/endian.rs b/src/row/arch/x86_avx512/endian.rs
new file mode 100644
index 00000000..b886d90a
--- /dev/null
+++ b/src/row/arch/x86_avx512/endian.rs
@@ -0,0 +1,135 @@
+//! Endian-aware u16/u32 SIMD loaders for x86_64 AVX-512 (F + BW).
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a `__m512i` vector containing the elements in **host-native**
+//! byte order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (all real x86), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `_mm512_shuffle_epi8` (AVX-512BW) using
+//! compile-time shuffle masks. AVX-512's `vpshufb` operates per 128-bit
+//! lane, so the mask replicates the same within-lane byte permutation across
+//! all four 128-bit lanes of the 512-bit register.
+
+use core::arch::x86_64::*;
+
+// ---- Byte-swap shuffle masks -----------------------------------------------
+
+/// AVX-512BW `_mm512_shuffle_epi8` mask that swaps bytes within every 2-byte
+/// (u16) lane across all four 128-bit lanes.
+const BYTESWAP_MASK_U16: __m512i = unsafe {
+    core::mem::transmute([
+        // lane 0
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // lane 1
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // lane 2
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // lane 3
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+    ])
+};
+
+/// AVX-512BW `_mm512_shuffle_epi8` mask that swaps bytes within every 4-byte
+/// (u32) lane across all four 128-bit lanes.
+const BYTESWAP_MASK_U32: __m512i = unsafe {
+    core::mem::transmute([
+        // lane 0
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // lane 1
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // lane 2
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // lane 3
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+    ])
+};
+
+// ---- u16x32 loaders --------------------------------------------------------
+
+/// Loads 32 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled via
+/// `#[target_feature(enable = "avx512f,avx512bw")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x32(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Loads 32 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x32(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x32` or `load_be_u16x32` based
+/// on the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x32` / `load_be_u16x32`.
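+///
+/// A minimal sketch of the intended tier-kernel call pattern, assuming a
+/// hypothetical kernel that is itself generic over `const BE: bool` (the
+/// kernel shape is an assumption; only the dispatcher below is real):
+///
+/// ```ignore
+/// #[target_feature(enable = "avx512f,avx512bw")]
+/// unsafe fn kernel_row<const BE: bool>(src: *const u8, vectors: usize) {
+///     for i in 0..vectors {
+///         let v = unsafe { load_endian_u16x32::<BE>(src.add(i * 64)) };
+///         // ... native u16 math on `v`; the untaken endian branch is
+///         // statically dead in each monomorphization ...
+///     }
+/// }
+/// ```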
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x32<const BE: bool>(ptr: *const u8) -> __m512i {
+    if BE {
+        unsafe { load_be_u16x32(ptr) }
+    } else {
+        unsafe { load_le_u16x32(ptr) }
+    }
+}
+
+// ---- u32x16 loaders --------------------------------------------------------
+
+/// Loads 16 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x16(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Loads 16 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x16(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x16` or `load_be_u32x16` based
+/// on the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x16` / `load_be_u32x16`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x16<const BE: bool>(ptr: *const u8) -> __m512i {
+    if BE {
+        unsafe { load_be_u32x16(ptr) }
+    } else {
+        unsafe { load_le_u32x16(ptr) }
+    }
+}
diff --git a/src/row/arch/x86_avx512/mod.rs b/src/row/arch/x86_avx512/mod.rs
index f93dece5..ec820dc3 100644
--- a/src/row/arch/x86_avx512/mod.rs
+++ b/src/row/arch/x86_avx512/mod.rs
@@ -73,6 +73,7 @@ pub(super) use crate::{
 
 mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/x86_avx512/tests/endian.rs b/src/row/arch/x86_avx512/tests/endian.rs
new file mode 100644
index 00000000..a5f5dd7c
--- /dev/null
+++ b/src/row/arch/x86_avx512/tests/endian.rs
@@ -0,0 +1,213 @@
+use crate::row::arch::x86_avx512::endian::*;
+
+// Helper: extract __m512i to a stack array of 32 u16 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m512i_to_u16x32(v: core::arch::x86_64::__m512i) -> [u16; 32] {
+    let mut out = [0u16; 32];
+    unsafe { core::arch::x86_64::_mm512_storeu_si512(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract __m512i to a stack array of 16 u32 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m512i_to_u32x16(v: core::arch::x86_64::__m512i) -> [u32; 16] {
+    let mut out = [0u32; 16];
+    unsafe { core::arch::x86_64::_mm512_storeu_si512(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_le_u16x32_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // Build 64 bytes: pairs [lo, hi] for values 0x0102..0x2021.
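+    // e.g. i = 0 -> bytes [0x02, 0x01] -> LE u16 0x0102;
+    //      i = 31 -> bytes [0x21, 0x20] -> LE u16 0x2021.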
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        // LE encoding: low byte first
+        input[i * 2] = ((i + 1) as u8).wrapping_add(1); // low byte
+        input[i * 2 + 1] = (i + 1) as u8; // high byte
+    }
+    let v = unsafe { load_le_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(
+        got, expected,
+        "AVX-512 load_le_u16x32 must not swap on LE host"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx512_load_le_u16x32_swaps_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = ((i + 1) as u8).wrapping_add(1);
+        input[i * 2 + 1] = (i + 1) as u8;
+    }
+    let v = unsafe { load_le_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(got, expected, "AVX-512 load_le_u16x32 must swap on BE host");
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_be_u16x32_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // BE encoding: high byte first.
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = (i + 1) as u8; // high byte
+        input[i * 2 + 1] = ((i + 1) as u8).wrapping_add(1); // low byte
+    }
+    let v = unsafe { load_be_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(got, expected, "AVX-512 load_be_u16x32 must swap on LE host");
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx512_load_be_u16x32_noop_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = (i + 1) as u8;
+        input[i * 2 + 1] = ((i + 1) as u8).wrapping_add(1);
+    }
+    let v = unsafe { load_be_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(
+        got, expected,
+        "AVX-512 load_be_u16x32 must not swap on BE host"
+    );
+}
+
+// ---- u32x16 LE loader on LE host (no-op) -----------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_le_u32x16_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let input: [u8; 64] = [
+        0x04, 0x03, 0x02, 0x01, 0x08, 0x07, 0x06, 0x05, 0x0c, 0x0b, 0x0a, 0x09, 0x10, 0x0f, 0x0e, 0x0d,
+        0x14, 0x13, 0x12, 0x11, 0x18, 0x17, 0x16, 0x15, 0x1c, 0x1b, 0x1a, 0x19, 0x20, 0x1f, 0x1e, 0x1d,
+        0x24, 0x23, 0x22, 0x21, 0x28, 0x27, 0x26, 0x25, 0x2c, 0x2b, 0x2a, 0x29, 0x30, 0x2f, 0x2e, 0x2d,
+        0x34, 0x33, 0x32, 0x31, 0x38, 0x37, 0x36, 0x35, 0x3c, 0x3b, 0x3a, 0x39, 0x40, 0x3f, 0x3e, 0x3d,
+    ];
+    let v = unsafe { load_le_u32x16(input.as_ptr()) };
+    let got = unsafe { m512i_to_u32x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20, 0x21222324, 0x25262728, 0x292a2b2c, 0x2d2e2f30, 0x31323334, 0x35363738,
+            0x393a3b3c, 0x3d3e3f40,
+        ],
+        "AVX-512 load_le_u32x16 must not swap on LE host"
+    );
+}
+
+// ---- u32x16 BE loader on LE host (swap) ------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_be_u32x16_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let input: [u8; 64] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+        0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+        0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40,
+    ];
+    let v = unsafe { load_be_u32x16(input.as_ptr()) };
+    let got = unsafe { m512i_to_u32x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20, 0x21222324, 0x25262728, 0x292a2b2c, 0x2d2e2f30, 0x31323334, 0x35363738,
+            0x393a3b3c, 0x3d3e3f40,
+        ],
+        "AVX-512 load_be_u32x16 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_endian_u16x32_le_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = ((i + 1) as u8).wrapping_add(1);
+        input[i * 2 + 1] = (i + 1) as u8;
+    }
+    let direct = unsafe { load_le_u16x32(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x32::<false>(input.as_ptr()) };
+    let d = unsafe { m512i_to_u16x32(direct) };
+    let g = unsafe { m512i_to_u16x32(via_dispatch) };
+    assert_eq!(
+        d, g,
+        "load_endian_u16x32::<false> must match load_le_u16x32"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_endian_u16x32_be_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = (i + 1) as u8;
+        input[i * 2 + 1] = ((i + 1) as u8).wrapping_add(1);
+    }
+    let direct = unsafe { load_be_u16x32(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x32::<true>(input.as_ptr()) };
+    let d = unsafe { m512i_to_u16x32(direct) };
+    let g = unsafe { m512i_to_u16x32(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x32::<true> must match load_be_u16x32");
+}
diff --git a/src/row/arch/x86_avx512/tests/mod.rs b/src/row/arch/x86_avx512/tests/mod.rs
index a66d6ea6..de1c57ad 100644
--- a/src/row/arch/x86_avx512/tests/mod.rs
+++ b/src/row/arch/x86_avx512/tests/mod.rs
@@ -1,4 +1,5 @@
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/x86_sse41/endian.rs b/src/row/arch/x86_sse41/endian.rs
new file mode 100644
index 00000000..f7dc1d38
--- /dev/null
+++ b/src/row/arch/x86_sse41/endian.rs
@@ -0,0 +1,121 @@
+//! Endian-aware u16/u32 SIMD loaders for x86_64 SSE4.1.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns an `__m128i` vector containing the elements in **host-native**
+//! byte order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (all real x86), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `_mm_shuffle_epi8` (SSSE3, which every
+//! SSE4.1-capable CPU also supports) using compile-time shuffle masks. The
+//! mask constants use `core::mem::transmute` because `__m128i` has no `const`
+//! constructor in stable Rust; the transmutes are always safe — `__m128i` is
+//! a plain 128-bit bag of bits.
+
+use core::arch::x86_64::*;
+
+// ---- Byte-swap shuffle masks -----------------------------------------------
+
+/// SSSE3 `_mm_shuffle_epi8` mask that swaps bytes within every 2-byte (u16)
+/// lane: `[1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14]`.
+const BYTESWAP_MASK_U16: __m128i =
+    unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) };
+
+/// SSSE3 `_mm_shuffle_epi8` mask that swaps bytes within every 4-byte (u32)
+/// lane: `[3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12]`.
+const BYTESWAP_MASK_U32: __m128i =
+    unsafe { core::mem::transmute([3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]) };
+
+// ---- u16x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled via `#[target_feature(enable = "sse4.1")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x8` / `load_be_u16x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> __m128i {
+    if BE {
+        unsafe { load_be_u16x8(ptr) }
+    } else {
+        unsafe { load_le_u16x8(ptr) }
+    }
+}
+
+// ---- u32x4 loaders ---------------------------------------------------------
+
+/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled.
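+///
+/// A minimal illustrative sketch of a call site (the wrapping function is an
+/// assumption for illustration, not part of this crate):
+///
+/// ```ignore
+/// #[target_feature(enable = "sse4.1")]
+/// unsafe fn first_be_u32(src: &[u8; 16]) -> u32 {
+///     let v = unsafe { load_be_u32x4(src.as_ptr()) };
+///     // lane 0: e.g. bytes [0x01, 0x02, 0x03, 0x04, ..] -> 0x01020304
+///     unsafe { _mm_cvtsi128_si32(v) as u32 }
+/// }
+/// ```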
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x4` or `load_be_u32x4` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x4` / `load_be_u32x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> __m128i {
+    if BE {
+        unsafe { load_be_u32x4(ptr) }
+    } else {
+        unsafe { load_le_u32x4(ptr) }
+    }
+}
diff --git a/src/row/arch/x86_sse41/mod.rs b/src/row/arch/x86_sse41/mod.rs
index 2a54eb76..22a337f0 100644
--- a/src/row/arch/x86_sse41/mod.rs
+++ b/src/row/arch/x86_sse41/mod.rs
@@ -56,6 +56,7 @@ pub(super) use crate::{
 
 mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/x86_sse41/tests/endian.rs b/src/row/arch/x86_sse41/tests/endian.rs
new file mode 100644
index 00000000..c6dca72d
--- /dev/null
+++ b/src/row/arch/x86_sse41/tests/endian.rs
@@ -0,0 +1,207 @@
+use crate::row::arch::x86_sse41::endian::*;
+
+// Helper: extract __m128i to a stack array of 8 u16 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m128i_to_u16x8(v: core::arch::x86_64::__m128i) -> [u16; 8] {
+    let mut out = [0u16; 8];
+    unsafe { core::arch::x86_64::_mm_storeu_si128(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract __m128i to a stack array of 4 u32 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m128i_to_u32x4(v: core::arch::x86_64::__m128i) -> [u32; 4] {
+    let mut out = [0u32; 4];
+    unsafe { core::arch::x86_64::_mm_storeu_si128(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+/// On a LE host, `load_le_u16x8` must NOT swap bytes.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_le_u16x8_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x02, 0x01, // u16[0] = 0x0102
+        0x04, 0x03, // u16[1] = 0x0304
+        0x06, 0x05, // u16[2] = 0x0506
+        0x08, 0x07, // u16[3] = 0x0708
+        0x0a, 0x09, // u16[4] = 0x090a
+        0x0c, 0x0b, // u16[5] = 0x0b0c
+        0x0e, 0x0d, // u16[6] = 0x0d0e
+        0x10, 0x0f, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_le_u16x8 must not swap on LE host"
+    );
+}
+
+/// On a BE host, `load_le_u16x8` MUST swap bytes.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn sse41_load_le_u16x8_swaps_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_le_u16x8 must swap on BE host"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+/// On a LE host, `load_be_u16x8` MUST swap bytes.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_be_u16x8_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, // u16[0] = 0x0102 BE
+        0x03, 0x04, // u16[1] = 0x0304
+        0x05, 0x06, // u16[2] = 0x0506
+        0x07, 0x08, // u16[3] = 0x0708
+        0x09, 0x0a, // u16[4] = 0x090a
+        0x0b, 0x0c, // u16[5] = 0x0b0c
+        0x0d, 0x0e, // u16[6] = 0x0d0e
+        0x0f, 0x10, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_be_u16x8 must swap on LE host"
+    );
+}
+
+/// On a BE host, `load_be_u16x8` must NOT swap.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn sse41_load_be_u16x8_noop_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_be_u16x8 must not swap on BE host"
+    );
+}
+
+// ---- u32x4 LE loader on LE host (no-op) ------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_le_u32x4_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x04, 0x03, 0x02, 0x01, // u32[0] = 0x01020304 LE
+        0x08, 0x07, 0x06, 0x05, // u32[1] = 0x05060708
+        0x0c, 0x0b, 0x0a, 0x09, // u32[2] = 0x090a0b0c
+        0x10, 0x0f, 0x0e, 0x0d, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_le_u32x4(input.as_ptr()) };
+    let got = unsafe { m128i_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "SSE4.1 load_le_u32x4 must not swap on LE host"
+    );
+}
+
+// ---- u32x4 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_be_u32x4_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, // u32[0] = 0x01020304 BE
+        0x05, 0x06, 0x07, 0x08, // u32[1] = 0x05060708
+        0x09, 0x0a, 0x0b, 0x0c, // u32[2] = 0x090a0b0c
+        0x0d, 0x0e, 0x0f, 0x10, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_be_u32x4(input.as_ptr()) };
+    let got = unsafe { m128i_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "SSE4.1 load_be_u32x4 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_endian_u16x8_le_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let direct = unsafe { load_le_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<false>(input.as_ptr()) };
+    let d = unsafe { m128i_to_u16x8(direct) };
+    let g = unsafe { m128i_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<false> must match load_le_u16x8");
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_endian_u16x8_be_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let direct = unsafe { load_be_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<true>(input.as_ptr()) };
+    let d = unsafe { m128i_to_u16x8(direct) };
+    let g = unsafe { m128i_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<true> must match load_be_u16x8");
+}
diff --git a/src/row/arch/x86_sse41/tests/mod.rs b/src/row/arch/x86_sse41/tests/mod.rs
index be4f8679..f5e14528 100644
--- a/src/row/arch/x86_sse41/tests/mod.rs
+++ b/src/row/arch/x86_sse41/tests/mod.rs
@@ -1,4 +1,5 @@
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;