diff --git a/src/row/arch/neon/endian.rs b/src/row/arch/neon/endian.rs
index 6f800f7b..55ac4ad5 100644
--- a/src/row/arch/neon/endian.rs
+++ b/src/row/arch/neon/endian.rs
@@ -107,3 +107,53 @@ pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32x4_t
         unsafe { load_le_u32x4(ptr) }
     }
 }
+
+// ---- u16x4 loaders (8-byte half-vector) ------------------------------------
+//
+// These load only 8 bytes (4 × u16) into a `uint16x4_t` in host-native order.
+// Used by Rgbf16 widen kernels (`vcvt_f32_f16` reads 4 × f16 from a
+// `uint16x4_t`) when the caller can only guarantee 8 readable bytes — using
+// the 16-byte `load_endian_u16x8` would tail-overread.
+
+/// Loads 4 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 8 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x4(ptr: *const u8) -> uint16x4_t {
+    let v = unsafe { vld1_u16(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(v))) };
+    v
+}
+
+/// Loads 4 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 8 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x4(ptr: *const u8) -> uint16x4_t {
+    let v = unsafe { vld1_u16(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(v))) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x4` or `load_be_u16x4` based on
+/// the compile-time `BE` const parameter. Reads exactly 8 bytes.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x4` / `load_be_u16x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x4<const BE: bool>(ptr: *const u8) -> uint16x4_t {
+    if BE {
+        unsafe { load_be_u16x4(ptr) }
+    } else {
+        unsafe { load_le_u16x4(ptr) }
+    }
+}
diff --git a/src/row/arch/neon/packed_rgb_float.rs b/src/row/arch/neon/packed_rgb_float.rs
index 548a81f0..1ea6331a 100644
--- a/src/row/arch/neon/packed_rgb_float.rs
+++ b/src/row/arch/neon/packed_rgb_float.rs
@@ -15,14 +15,37 @@
 //! vector to 4 bytes / 4 u16 elements with `vst*` straight into the
 //! `R, G, B, R, …` packed output) and trivially fine for the lossless
 //! `f32` pass-through (just `vst1q_f32`).
+//!
+//! For `<const BE: bool>` kernels, each 4-lane f32 load is replaced by
+//! an endian-aware u32x4 load (via `load_endian_u32x4::<BE>`) followed
+//! by a `vreinterpretq_f32_u32` cast. For LE (BE=false) this is a
+//! pure load; for BE it adds a `vrev32q_u8` byte-swap.
 
 use core::arch::aarch64::*;
 
-use super::scalar;
+use super::{endian::load_endian_u32x4, scalar};
+
+/// Load 4 `f32` lanes from `ptr` in endian-aware fashion.
+/// `BE = false` → host-native load (identical to `vld1q_f32`).
+/// `BE = true` → load as u32 with byte-swap, then reinterpret as f32.
+///
+/// # Safety
+///
+/// * NEON must be available.
+/// * `ptr` must be valid for 16 bytes.
+#[inline(always)]
+unsafe fn load_f32x4<const BE: bool>(ptr: *const f32) -> float32x4_t {
+    unsafe {
+        let u = load_endian_u32x4::<BE>(ptr as *const u8);
+        vreinterpretq_f32_u32(u)
+    }
+}
 
 /// f32 RGB → u8 RGB. Clamp `[0, 1]` × 255, saturating round-to-nearest
 /// cast.
 ///
+/// When `BE = true` the input `f32` values are big-endian encoded.
+///
 /// # Safety
 ///
 /// 1. NEON must be available (`is_aarch64_feature_detected!("neon")`).
@@ -30,7 +53,11 @@ use super::scalar;
 /// 3.
`rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -45,9 +72,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = vld1q_f32(rgb_in.as_ptr().add(lane)); - let v1 = vld1q_f32(rgb_in.as_ptr().add(lane + 4)); - let v2 = vld1q_f32(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let s0 = vmulq_f32(vminq_f32(vmaxq_f32(v0, zero), one), scale); let s1 = vmulq_f32(vminq_f32(vmaxq_f32(v1, zero), one), scale); @@ -84,7 +111,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let pix_done = lane / 3; let tail_pix = width - pix_done; if tail_pix > 0 { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], tail_pix, @@ -95,12 +122,18 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -126,10 +159,39 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid // — the f32→u8 cast itself is the cost, not the gather. for sub in 0..4 { let base = (x + sub * 4) * 3; - let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); - let r_clamped = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.0, zero), one), scale); - let g_clamped = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.1, zero), one), scale); - let b_clamped = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.2, zero), one), scale); + // Fast path: on-disk encoding (`BE`) matches host-native, so the raw + // `vld3q_f32` reads host-native bytes that already carry the right f32 + // values. Slow path (`BE != HOST_NATIVE_BE`): each f32 must be byte- + // swapped before deinterleave; load via the endian-aware helper into + // a contiguous buffer, then unstride into per-channel vectors. + let (r_v, g_v, b_v) = if BE == HOST_NATIVE_BE { + let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); + (v_rgb.0, v_rgb.1, v_rgb.2) + } else { + let raw0 = load_f32x4::(rgb_in.as_ptr().add(base)); + let raw1 = load_f32x4::(rgb_in.as_ptr().add(base + 4)); + let raw2 = load_f32x4::(rgb_in.as_ptr().add(base + 8)); + // Manual deinterleave: contiguous host-native f32 layout is + // [R0,G0,B0,R1, G1,B1,R2,G2, B2,R3,G3,B3] across raw{0,1,2}. 
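+                // Channel c of pixel p sits at flat index i = p * 3 + c, i.e. in
+                // vector raw{i / 4} at lane i % 4; the hard-coded r_deint /
+                // g_deint / b_deint index patterns below follow from that.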
+ let mut r_arr = [0.0f32; 4]; + let mut g_arr = [0.0f32; 4]; + let mut b_arr = [0.0f32; 4]; + vst1q_f32(r_arr.as_mut_ptr(), raw0); + vst1q_f32(g_arr.as_mut_ptr(), raw1); + vst1q_f32(b_arr.as_mut_ptr(), raw2); + let r_deint = [r_arr[0], r_arr[3], g_arr[2], b_arr[1]]; + let g_deint = [r_arr[1], g_arr[0], g_arr[3], b_arr[2]]; + let b_deint = [r_arr[2], g_arr[1], b_arr[0], b_arr[3]]; + ( + vld1q_f32(r_deint.as_ptr()), + vld1q_f32(g_deint.as_ptr()), + vld1q_f32(b_deint.as_ptr()), + ) + }; + + let r_clamped = vmulq_f32(vminq_f32(vmaxq_f32(r_v, zero), one), scale); + let g_clamped = vmulq_f32(vminq_f32(vmaxq_f32(g_v, zero), one), scale); + let b_clamped = vmulq_f32(vminq_f32(vmaxq_f32(b_v, zero), one), scale); let r_u32 = vcvtnq_u32_f32(r_clamped); let g_u32 = vcvtnq_u32_f32(g_clamped); @@ -163,7 +225,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid } if x < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[x * 3..width * 3], &mut rgba_out[x * 4..width * 4], width - x, @@ -174,13 +236,19 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid /// f32 RGB → u16 RGB. Clamp `[0, 1]` × 65535, saturating cast. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -193,9 +261,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = vld1q_f32(rgb_in.as_ptr().add(lane)); - let v1 = vld1q_f32(rgb_in.as_ptr().add(lane + 4)); - let v2 = vld1q_f32(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let s0 = vmulq_f32(vminq_f32(vmaxq_f32(v0, zero), one), scale); let s1 = vmulq_f32(vminq_f32(vmaxq_f32(v1, zero), one), scale); @@ -212,7 +280,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let pix_done = lane / 3; let tail_pix = width - pix_done; if tail_pix > 0 { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], tail_pix, @@ -223,13 +291,19 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_u16_row`] but the output is `&mut [u16]` /// with `len() >= 4 * width` u16 elements. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -253,10 +327,38 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut b_h = [0u16; 8]; for sub in 0..2 { let base = (x + sub * 4) * 3; - let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); - let r_s = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.0, zero), one), scale); - let g_s = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.1, zero), one), scale); - let b_s = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.2, zero), one), scale); + // Fast path: `BE == HOST_NATIVE_BE` means on-disk encoding matches the + // host's native byte order, so `vld3q_f32` (which always reads host- + // native bytes) decodes the encoded f32s correctly. Slow path: the + // encoded bytes are foreign — load each f32 through the endian-aware + // helper (which byte-swaps when `BE != HOST_NATIVE_BE`) into a + // contiguous buffer, then deinterleave into per-channel vectors. + let (r_v, g_v, b_v) = if BE == HOST_NATIVE_BE { + let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); + (v_rgb.0, v_rgb.1, v_rgb.2) + } else { + let raw0 = load_f32x4::(rgb_in.as_ptr().add(base)); + let raw1 = load_f32x4::(rgb_in.as_ptr().add(base + 4)); + let raw2 = load_f32x4::(rgb_in.as_ptr().add(base + 8)); + let mut r_arr = [0.0f32; 4]; + let mut g_arr = [0.0f32; 4]; + let mut b_arr = [0.0f32; 4]; + vst1q_f32(r_arr.as_mut_ptr(), raw0); + vst1q_f32(g_arr.as_mut_ptr(), raw1); + vst1q_f32(b_arr.as_mut_ptr(), raw2); + let r_deint = [r_arr[0], r_arr[3], g_arr[2], b_arr[1]]; + let g_deint = [r_arr[1], g_arr[0], g_arr[3], b_arr[2]]; + let b_deint = [r_arr[2], g_arr[1], b_arr[0], b_arr[3]]; + ( + vld1q_f32(r_deint.as_ptr()), + vld1q_f32(g_deint.as_ptr()), + vld1q_f32(b_deint.as_ptr()), + ) + }; + + let r_s = vmulq_f32(vminq_f32(vmaxq_f32(r_v, zero), one), scale); + let g_s = vmulq_f32(vminq_f32(vmaxq_f32(g_v, zero), one), scale); + let b_s = vmulq_f32(vminq_f32(vmaxq_f32(b_v, zero), one), scale); let r_u = vqmovn_u32(vcvtnq_u32_f32(r_s)); let g_u = vqmovn_u32(vcvtnq_u32_f32(g_s)); let b_u = vqmovn_u32(vcvtnq_u32_f32(b_s)); @@ -273,7 +375,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } if x < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[x * 3..width * 3], &mut rgba_out[x * 4..width * 4], width - x, @@ -284,27 +386,58 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] /// f32 RGB → f32 RGB lossless pass-through. /// +/// When `BE = true` the input values are byte-swapped to host-native +/// before being written (big-endian input → host-native output). +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. 
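+// A minimal call-site sketch (the real dispatcher is not part of this diff,
+// and `source_is_be` is a hypothetical flag): the const parameter is chosen
+// once, from the source's on-disk byte order.
+//
+//     if source_is_be {
+//         unsafe { rgbf32_to_rgb_row::<true>(rgb_in, rgb_out, width) }
+//     } else {
+//         unsafe { rgbf32_to_rgb_row::<false>(rgb_in, rgb_out, width) }
+//     }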
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); unsafe { let total = width * 3; let mut i = 0usize; - while i + 4 <= total { - let v = vld1q_f32(rgb_in.as_ptr().add(i)); - vst1q_f32(rgb_out.as_mut_ptr().add(i), v); - i += 4; - } - while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); - i += 1; + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `vld1q_f32` reads host-native + // bytes which is exactly what we need to emit. Otherwise we must decode + // through `load_f32x4::` (which byte-swaps when BE != host-native) so + // the stored host-native f32 round-trips back to the same value. + if BE == HOST_NATIVE_BE { + while i + 4 <= total { + let v = vld1q_f32(rgb_in.as_ptr().add(i)); + vst1q_f32(rgb_out.as_mut_ptr().add(i), v); + i += 4; + } + while i < total { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + i += 1; + } + } else { + // Encoding doesn't match host: decode each lane to host-native. + while i + 4 <= total { + let v = load_f32x4::(rgb_in.as_ptr().add(i)); + vst1q_f32(rgb_out.as_mut_ptr().add(i), v); + i += 4; + } + while i < total { + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); + i += 1; + } } } } @@ -316,25 +449,58 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // 4 pixels (= 12 f16 values) which matches the Rgbf32 loop granularity. // // `vcvt_f32_f16` widens 4 × f16 to 4 × f32 in a single FCVT instruction. -// We load the raw u16 bits with `vld1_u16` and reinterpret the result as -// `float16x4_t` before calling `vcvt_f32_f16`. +// +// For BE: we load the u16 bits via `load_endian_u16x4::` (loads 4 u16 with +// byte-swap for BE) into a `uint16x4_t`, then reinterpret as `float16x4_t` +// before widening with `vcvt_f32_f16`. `load_endian_u16x4` reads exactly +// 8 bytes regardless of `BE`, matching the 4 × f16 region the kernel owns +// (a 16-byte load via `load_endian_u16x8` would tail-overread). + +use super::endian::load_endian_u16x4; + +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `vcvt_f32_f16`. On a LE target, host- +/// native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `vld1q_f32`/`vst1q_f32` copy is byte-correct only when the source encoding +/// (`BE`) matches the host's native endian, so the kernel falls through to +/// the endian-aware `load_f32x4::` slow path otherwise. +/// +/// Same gate applies to the `rgbf32_to_rgba_row` / `rgbf32_to_rgba_u16_row` +/// `vld3q_f32` deinterleave fast path: `vld3q_f32` reads host-native bytes, +/// so it's only correct when the on-disk encoding matches host-native. 
+/// Otherwise the kernel falls through to the endian-aware `load_f32x4::<BE>`
+/// path with a manual deinterleave.
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
 
 /// Widen 4 half-precision floats (`f16x4`, i.e. 8 bytes starting at `ptr`)
 /// to 4 single-precision floats into `out[0..4]`.
 ///
+/// For `BE = true` the f16 values are stored big-endian (bytes swapped);
+/// the byte-swap is applied before the widening conversion. The loader reads
+/// exactly 8 bytes regardless of `BE` so the caller's `ptr` only needs 8
+/// readable bytes (a 16-byte load via `load_endian_u16x8` would tail-overread
+/// the 4 × f16 region the kernel actually owns).
+///
 /// # Safety
 ///
 /// * NEON must be available.
-/// * `ptr` must be valid for 4 × u16 reads (8 bytes, no alignment required
-///   because `vld1_u16` accepts unaligned pointers on AArch64).
+/// * `ptr` must be valid for 4 × u16 reads (8 bytes).
 /// * `out` must be valid for 4 × f32 writes.
 #[inline(always)]
-unsafe fn widen_f16x4(ptr: *const half::f16, out: *mut f32) {
+unsafe fn widen_f16x4<const BE: bool>(ptr: *const half::f16, out: *mut f32) {
     unsafe {
-        let u16s = vld1_u16(ptr as *const u16);
-        let f16s = vreinterpret_f16_u16(u16s);
-        let f32s = vcvt_f32_f16(f16s);
-        vst1q_f32(out, f32s);
+        // 8-byte load (4 × u16), byte-swapped per-lane when BE = true so the
+        // resulting `uint16x4_t` carries host-native f16 bit patterns ready for
+        // `vcvt_f32_f16`.
+        let u16x4 = load_endian_u16x4::<BE>(ptr as *const u8);
+        let f16x4 = vreinterpret_f16_u16(u16x4);
+        let f32x4 = vcvt_f32_f16(f16x4);
+        vst1q_f32(out, f32x4);
     }
 }
 
@@ -342,17 +508,31 @@ unsafe fn widen_f16x4(ptr: *const half::f16, out: *mut f32) {
 /// `n` must be in `[0, 4]` — `n == 0` is a no-op (the caller passes
 /// `total_lanes - lane`, which is `0` when `total_lanes` is a multiple of 4
 /// and the SIMD loop consumed the whole row).
+///
+/// For `BE = true` the source f16 bits are decoded from big-endian to
+/// host-native before widening; for `BE = false` they are read as host-
+/// native (identical to a plain LE load on every shipping target). This
+/// matches the SIMD body's `widen_f16x4::<BE>` semantics so partial-pixel
+/// tail bytes round-trip identically to the full-vector path.
 #[inline(always)]
-unsafe fn widen_f16_tail(src: &[half::f16], dst: &mut [f32], n: usize) {
+unsafe fn widen_f16_tail<const BE: bool>(src: &[half::f16], dst: &mut [f32], n: usize) {
     for i in 0..n {
         unsafe {
-            *dst.get_unchecked_mut(i) = src.get_unchecked(i).to_f32();
+            let raw = src.get_unchecked(i).to_bits();
+            let host_bits = if BE {
+                u16::from_be(raw)
+            } else {
+                u16::from_le(raw)
+            };
+            *dst.get_unchecked_mut(i) = half::f16::from_bits(host_bits).to_f32();
         }
     }
 }
 
 /// f16 RGB → u8 RGB.
 ///
+/// When `BE = true` the input `half::f16` values are big-endian encoded.
+///
 /// # Safety
 ///
 /// 1. NEON must be available.
@@ -360,7 +540,11 @@ unsafe fn widen_f16_tail(src: &[half::f16], dst: &mut [f32], n: usize) {
 /// 3. `rgb_in` / `rgb_out` must not alias.
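+// The Rgbf16 → u8 / u16 kernels below all follow the same two-stage flow
+// (the lossless f16 → f32 widen stops after the first stage, and f16 → f16
+// delegates straight to scalar):
+//
+//     encoded f16 row  --widen_f16x4::<BE> / widen_f16_tail::<BE>-->
+//     host-native f32 buffer  --rgbf32_to_*::<HOST_NATIVE_BE>-->  u8 / u16 row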
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -370,16 +554,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + // Buffer is host-native f32 after vcvt_f32_f16; route through the f32 + // kernel with HOST_NATIVE_BE so its loaders perform a no-op swap. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -389,12 +575,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha forced to `0xFF`). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -404,16 +596,21 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + // Buffer is host-native f32; route via HOST_NATIVE_BE (see widen_f16x4). + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -423,13 +620,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB. /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. 
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -442,16 +641,17 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + // Buffer is host-native f32; route via HOST_NATIVE_BE (see widen_f16x4). + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -461,13 +661,15 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha forced to `0xFFFF`). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but the output is `&mut [u16]` with /// `len() >= 4 * width` u16 elements. #[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -481,16 +683,21 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + // Buffer is host-native f32; route via HOST_NATIVE_BE (see widen_f16x4). + rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -500,13 +707,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. 
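+// The `neon,fp16` kernels in this file must be gated on runtime detection by
+// the caller, as the parity tests do. A minimal sketch of that guard (the
+// scalar fallback shown here is illustrative, not the crate's dispatcher):
+//
+//     if std::arch::is_aarch64_feature_detected!("fp16") {
+//         unsafe { rgbf16_to_rgb_row::<false>(rgb_in, rgb_out, width) }
+//     } else {
+//         scalar::rgbf16_to_rgb_row::<false>(rgb_in, rgb_out, width)
+//     }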
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -518,13 +727,13 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 4 <= total_lanes { unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), rgb_out.as_mut_ptr().add(lane)); + widen_f16x4::(rgb_in.as_ptr().add(lane), rgb_out.as_mut_ptr().add(lane)); } lane += 4; } // Scalar tail for the last 0-3 lanes (partial pixel at most). unsafe { - widen_f16_tail( + widen_f16_tail::( rgb_in.get_unchecked(lane..), rgb_out.get_unchecked_mut(lane..), total_lanes - lane, @@ -534,13 +743,16 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( /// f16 RGB → f16 RGB lossless pass-through. /// +/// When `BE = true` the input values are byte-swapped to host-native order +/// on output. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, @@ -548,6 +760,6 @@ pub(crate) unsafe fn rgbf16_to_rgb_f16_row( debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - // Bit-exact copy: reuse scalar which is already just copy_from_slice. - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + // Bit-exact copy / byte-swap: delegate to scalar. + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/neon/tests/packed_rgb_float.rs b/src/row/arch/neon/tests/packed_rgb_float.rs index 42f566fe..2f2ce7f8 100644 --- a/src/row/arch/neon/tests/packed_rgb_float.rs +++ b/src/row/arch/neon/tests/packed_rgb_float.rs @@ -1,6 +1,47 @@ use super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 SIMD-vs-scalar parity tests below (and the six Rgbf16 +// parity tests further down) build their fixtures via +// `pseudo_random_rgbf32` / `pseudo_random_rgbf16`, which produce +// host-native `f32` / `half::f16` values. They then call the kernels +// with `::`, which means "input is LE-encoded — decode to +// host-native by applying `u32::from_le` / `u16::from_le`". +// +// On a little-endian host (e.g. aarch64-apple-darwin), host-native bits +// and LE-encoded bits are the same byte sequence, so `from_le` is a +// no-op and the assertions hold. +// +// On a big-endian host (e.g. aarch64-be-linux-gnu), host-native f32 / +// f16 bits do NOT lay out little-endian, so the kernel's `from_le` +// byte-swap correctly reinterprets the host-native fixture as if it +// were an LE-encoded payload — producing a different (corrupted) value +// than the test expects. Because both the scalar and NEON kernels +// apply the same `from_le` byte-swap, the SIMD-vs-scalar parity +// assertions still hold on BE — but they're vacuously testing +// "scalar and SIMD are identically wrong", not kernel correctness. +// The two `lossless` host-native equality assertions +// (`assert_eq!(out_neon, input[..w * 3])` for `rgbf32_to_rgb_f32_row` +// and `rgbf16_to_rgb_f16_row`) would fail outright on BE since the +// kernel decodes through `load_f32x4::` / scalar `from_le` to +// produce a byte-swapped (relative to host-native) result. 
+// +// The kernel itself is correct on BE; this is purely a fixture-vs- +// kernel byte-order mismatch (same class as the scalar tests gated in +// `56342c0`, and the PR #82 alpha_extract / planar_gbr_high_bit gates +// in `8f2e329`). NEON BE-host correctness is locked down separately +// by the dedicated BE-parity tests in this same module (which build +// LE-encoded fixtures via `byte_swap` helpers and assert +// ``/`` parity on every host) and by the LE-decode +// regression tests added in commits c3a6478, dcf40a3, f1161d7, +// 63fdf8f (which build LE-encoded fixtures via +// `f32::from_bits(u32::from_le(_))` / +// `half::f16::from_bits(u16::from_le(_))` and assert kernel output +// matches scalar on every host). Those tests are intentionally NOT +// gated. /// Generates a row of pseudo-random `f32` RGB samples. Mix of in-range /// `[0, 1]` values, exact `0.5` (round-half-even tie), and HDR > 1.0 @@ -28,75 +69,80 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_neon, w); + rgbf32_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgba_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_neon, w); + rgbf32_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_u16_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_neon, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgba_u16_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_neon, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] 
+#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_f32_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_neon = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_neon, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); // Lossless: output should equal input bit-exact. @@ -119,6 +165,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -131,15 +178,16 @@ fn neon_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_neon, w); + rgbf16_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -152,15 +200,16 @@ fn neon_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_neon, w); + rgbf16_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -173,15 +222,16 @@ fn neon_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_neon, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -194,15 +244,16 @@ fn neon_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_neon, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -215,15 +266,16 @@ fn neon_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_neon = std::vec![0.0f32; w * 3]; - 
scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_neon, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -233,12 +285,395 @@ fn neon_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_neon = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_neon, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); // Lossless: output should equal input bit-exact. assert_eq!(out_neon, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — Rgbf32 ----------------------------------------------- +// +// For each kernel: byte-swap the LE f32 inputs into a BE buffer, call the +// kernel with `BE=true`, and assert the output matches the LE run (`BE=false`). + +/// Build a BE-encoded f32 slice by byte-swapping every 32-bit element. +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +/// Build a BE-encoded f16 slice by byte-swapping every 16-bit element. +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 
4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_f32_be_is_byteswap() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + // BE path byte-swaps each f32, producing host-native = same as LE. + assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb_f32 BE parity width {w}"); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `vld1q_f32`/`vst1q_f32` copy in the `BE = false` branch, which preserved the +/// LE byte order on store and produced corrupted (byte-swapped) host f32s. +/// The current kernel falls through to the endian-aware `load_f32x4::` +/// slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this test passes on both. +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. + let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "NEON rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgba_row::` +/// and asserts it produces the same output as the scalar reference run with +/// the same LE-encoded input via ``. +/// +/// On LE hosts this is vacuous (LE-encoded == host-native, both paths agree), +/// but on BE hosts it guards against the historical bug where the kernel used +/// a raw `vld3q_f32` deinterleave in the `BE = false` branch — that reads +/// host-native (BE) bytes from an LE-encoded buffer, mis-decoding the f32s. +/// The fixed kernel uses the `BE == HOST_NATIVE_BE` gate to choose between +/// the raw-load fast path and the endian-aware deinterleave, so it's correct +/// on both LE and BE hosts. 
+#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 15, 16, 17, 31, 33, 1920, 1921] { + let host_native = pseudo_random_rgbf32(w); + let le_in: std::vec::Vec = host_native + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out_simd = std::vec![0u8; w * 4]; + let mut out_scalar = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf32_to_rgba_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "NEON rgbf32_to_rgba_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgba_u16_row::` +/// and asserts it produces the same output as the scalar reference. +/// +/// Same rationale as `neon_rgbf32_to_rgba_row_le_input_decodes_correctly_on_any_host`: +/// the fast `vld3q_f32` deinterleave is only safe when on-disk encoding matches +/// host-native, so the kernel now gates it on `BE == HOST_NATIVE_BE`. +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_u16_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { + let host_native = pseudo_random_rgbf32(w); + let le_in: std::vec::Vec = host_native + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out_simd = std::vec![0u16; w * 4]; + let mut out_scalar = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf32_to_rgba_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "NEON rgbf32_to_rgba_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +// ---- BE parity tests — Rgbf16 ----------------------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + // Widths 4, 5, 16, 33 specifically exercise the f16 widen-tail boundary: + // the BE branch used to over-read past the row via load_endian_u16x8 (16 + // bytes) when only 8 bytes of f16 data are guaranteed at the third + // widen_f16x4 call per 12-lane chunk. With load_endian_u16x4 (8 bytes) + // the read stays within bounds; ASan / Miri pages would have caught the + // over-read as a guarded-page UB. 
+ for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_f32_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_f32 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_f16_be_is_byteswap() { + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + // BE byte-swap should reconstruct original LE output bit-exact. 
+        assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_f16 BE parity width {w}");
+    }
+}
+
+// ---- Tail-overread regression tests (NEON BE Rgbf16) ------------------------
+//
+// These specifically place the BE f16 input at the end of an exact-sized
+// allocation so any read past `3 * width * 2` bytes lands on the next
+// allocation (or, with ASan / Miri / guarded pages, an unmapped page). The
+// previous bug — `widen_f16x4::<true>` calling `load_endian_u16x8` (16-byte
+// load) when only 8 bytes were guaranteed — would over-read 8 bytes past the
+// row at widths where lane+12 reaches total_lanes (multiples of 4 pixels).
+//
+// Widths 4 and 16 are exact multiples of 4 pixels (the SIMD chunk size), so
+// the tail-overread happens on the LAST chunk; widths 5 and 33 leave 1 pixel
+// of scalar tail, so the over-read happens on the SECOND-TO-LAST chunk.
+
+fn assert_rgbf16_tail_overread_safe<const BE: bool, F>(width: usize, kernel: F)
+where
+    F: Fn(&[half::f16], &mut [u8], usize),
+{
+    // Allocate exact-sized input/output so any over-read lands outside the
+    // Vec's allocation. The exact-fit Vec mostly catches over-reads that go
+    // outside the page boundary — pair with ASan in CI for the strict case.
+    let width_lanes = width * 3;
+    let mut le_in = std::vec::Vec::<half::f16>::with_capacity(width_lanes);
+    for v in pseudo_random_rgbf16(width) {
+        le_in.push(v);
+    }
+    let raw: std::vec::Vec<half::f16> = if BE { be_rgbf16(&le_in) } else { le_in.clone() };
+    let mut out = std::vec![0u8; width * 3];
+    kernel(&raw, &mut out, width);
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn neon_rgbf16_be_tail_overread_widths_4_5_16_33() {
+    if !std::arch::is_aarch64_feature_detected!("fp16") {
+        return;
+    }
+    for &w in &[4usize, 5, 16, 33] {
+        assert_rgbf16_tail_overread_safe::<true, _>(w, |inp, out, w| unsafe {
+            rgbf16_to_rgb_row::<true>(inp, out, w);
+        });
+    }
+}
diff --git a/src/row/arch/wasm_simd128/packed_rgb_float.rs b/src/row/arch/wasm_simd128/packed_rgb_float.rs
index 82b6c538..1cfedf3b 100644
--- a/src/row/arch/wasm_simd128/packed_rgb_float.rs
+++ b/src/row/arch/wasm_simd128/packed_rgb_float.rs
@@ -10,7 +10,24 @@
 use core::arch::wasm32::*;
 
-use super::scalar;
+use super::{endian::load_endian_u32x4, scalar};
+
+/// `BE` value that makes the f32 row loaders treat their input as host-native
+/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack
+/// buffer is already host-native after `half::f16::to_f32()`. On a LE target,
+/// host-native == LE so `BE = false`; on a BE target, host-native == BE so
+/// `BE = true`. Without this routing the downstream `rgbf32_to_*::<false>`
+/// would byte-swap an already-decoded host-native f32 buffer on BE hosts.
+/// (`wasm32-*` is LE today, but keeping the routing endian-agnostic future-
+/// proofs against any BE wasm target.)
+///
+/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw
+/// `v128_load`/`v128_store` copy is byte-correct only when the source encoding
+/// (`BE`) matches the host's native endian, so the kernel falls through to
+/// the endian-aware `load_f32x4::<BE>` slow path otherwise.
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +// ---- helpers ------------------------------------------------------------------ #[inline(always)] fn clamp_scale_to_i32(v: v128, zero: v128, one: v128, scale: v128) -> v128 { @@ -21,6 +38,23 @@ fn clamp_scale_to_i32(v: v128, zero: v128, one: v128, scale: v128) -> v128 { i32x4_trunc_sat_f32x4(rounded) } +/// Load 4 f32 values from `ptr`, byte-swapping each 32-bit element when +/// `BE = true`. The returned `v128` holds f32 bit patterns in host-native +/// order so downstream float arithmetic is correct. +/// +/// # Safety +/// +/// `ptr` must point to at least 16 readable bytes. simd128 must be +/// available (compile-time `target_feature`). +#[inline(always)] +unsafe fn load_f32x4(ptr: *const f32) -> v128 { + // load_endian_u32x4 byte-swaps each 32-bit lane when BE=true, giving us + // host-native f32 bit patterns. + unsafe { load_endian_u32x4::(ptr as *const u8) } +} + +// ---- Tier 9 — Rgbf32 wasm-simd128 kernels ------------------------------------ + /// f32 RGB → u8 RGB. /// /// # Safety @@ -30,7 +64,11 @@ fn clamp_scale_to_i32(v: v128, zero: v128, one: v128, scale: v128) -> v128 { /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -43,9 +81,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width // 4 pixels = 12 lanes per iter. while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -68,7 +106,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -79,7 +117,11 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -92,9 +134,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid let mut pix = 0usize; while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -118,7 +160,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -129,7 +171,11 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid /// f32 RGB → u16 RGB. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -141,9 +187,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let mut lane = 0usize; while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -167,7 +213,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -178,7 +224,11 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -191,9 +241,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut pix = 0usize; while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -217,7 +267,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -225,27 +275,66 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } } -/// f32 RGB → f32 RGB lossless pass-through. +/// f32 RGB → f32 RGB lossless pass-through / byte-swap. +/// +/// - `BE == HOST_NATIVE_BE`: fast `v128_load` → `v128_store` copy (no math). +/// - otherwise: load each element through endian-aware `load_f32x4::` +/// (byte-swap to host-native), store as f32. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); - let total = width * 3; - let mut i = 0usize; - while i + 4 <= total { - unsafe { - let v = v128_load(rgb_in.as_ptr().add(i) as *const v128); - v128_store(rgb_out.as_mut_ptr().add(i) as *mut v128, v); + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `v128_load` reads host-native + // bytes which is exactly what we need to emit. Otherwise we must decode + // through `load_f32x4::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. + if BE == HOST_NATIVE_BE { + let total = width * 3; + let mut i = 0usize; + while i + 4 <= total { + unsafe { + let v = v128_load(rgb_in.as_ptr().add(i) as *const v128); + v128_store(rgb_out.as_mut_ptr().add(i) as *mut v128, v); + } + i += 4; } - i += 4; - } - while i < total { - unsafe { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + while i < total { + unsafe { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + } + i += 1; + } + } else { + // Encoding doesn't match host: decode each lane to host-native via the + // endian-aware loader (`load_endian_u32x4::` byte-swaps each lane). 
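+ // For reference: 1.0f32 has bit pattern 0x3F80_0000, so its BE encoding is
+ // the byte sequence [0x3F, 0x80, 0x00, 0x00] while LE stores
+ // [0x00, 0x00, 0x80, 0x3F]; the per-lane swap maps one onto the other.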
+ let total = width * 3; + let mut i = 0usize; + while i + 4 <= total { + unsafe { + let swapped = load_f32x4::(rgb_in.as_ptr().add(i)); + v128_store(rgb_out.as_mut_ptr().add(i) as *mut v128, swapped); + } + i += 4; + } + while i < total { + unsafe { + let bits = rgb_in.get_unchecked(i).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); + } + i += 1; } - i += 1; } } @@ -256,9 +345,10 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // `[f32; CHUNK_PIXELS * 3]` buffer, then call the existing wasm-simd128 // Rgbf32 downstream kernels for the f32→u8/u16/f32 work. // -// The widening loop is cheap relative to the subsequent SIMD integer conversion, -// so this hybrid strategy avoids a full scalar fallback while keeping the -// heavier per-sample math in SIMD. +// For BE inputs the byte-swap is applied before widening so the widened f32 +// buffer is already host-native; downstream f32 kernels are called with +// `HOST_NATIVE_BE` so their loaders perform a no-op byte-swap (correct on +// both LE and BE hosts). // // CHUNK_PIXELS = 4 (= 12 f32 lanes), matching the simd128 Rgbf32 loop stride. @@ -271,7 +361,11 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -281,16 +375,25 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); + let bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is now host-native f32; route via HOST_NATIVE_BE so the f32 + // loaders perform a no-op byte-swap on both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -305,7 +408,11 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -315,16 +422,28 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); + let bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -340,7 +459,7 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -353,16 +472,24 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); + let bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -377,7 +504,7 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -391,16 +518,28 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); + let bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. 
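+ // (On a big-endian host HOST_NATIVE_BE is true, so the downstream loader's
+ // byte-swap is a no-op; routing with BE = false there would re-swap the
+ // already-decoded host-native values and corrupt them.)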
+ rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -416,7 +555,7 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// `len() >= 3 * width` f32 elements. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -428,12 +567,19 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let total_lanes = width * 3; for i in 0..total_lanes { unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + let f = rgb_in.get_unchecked(i); + let raw = f.to_bits(); + let bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + *rgb_out.get_unchecked_mut(i) = half::f16::from_bits(bits).to_f32(); } } } -/// f16 RGB → f16 RGB lossless pass-through (wasm-simd128). +/// f16 RGB → f16 RGB lossless pass-through / byte-swap (wasm-simd128). /// /// # Safety /// @@ -441,12 +587,12 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs index 280889d3..0b151c52 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs @@ -1,6 +1,22 @@ use super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 / six Rgbf16 SIMD-vs-scalar parity tests below build +// fixtures via host-native f32 / f16 (`pseudo_random_rgbf32` / +// `pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. +// +// `target_arch = "wasm32"` is little-endian by spec, so the +// `#[cfg(target_endian = "little")]` gate is functionally a no-op on +// every supported configuration. It's added here for structural +// consistency with the NEON / scalar gating pattern. The +// `wasm_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host` +// regression test is correctly vacuous on LE hosts (probe-the-bug on +// hypothetical BE wasm) and is intentionally NOT gated. 
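+// As a concrete sketch of the byte-order classes involved (the helper name is
+// illustrative only; nothing below defines it): a host-native fixture can be
+// re-expressed as an explicitly LE-encoded one that a `BE = false` kernel
+// decodes back to the original values on LE and BE hosts alike:
+//
+//     fn le_encoded_rgbf32(width: usize) -> std::vec::Vec<f32> {
+//         pseudo_random_rgbf32(width)
+//             .iter()
+//             .map(|v| f32::from_bits(u32::from_le(v.to_bits())))
+//             .collect()
+//     }
+//
+// The LE-decode regression test further down applies the same
+// `f32::from_bits(u32::from_le(v.to_bits()))` expression inline.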
fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { let n = width * 3; @@ -21,70 +37,75 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgb_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgba_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgba width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgb_u16_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgb_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgba_u16_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgba_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgb_f32_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -101,6 +122,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -110,15 +132,16 @@ fn wasm_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - 
scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -128,15 +151,16 @@ fn wasm_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgba width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -146,15 +170,16 @@ fn wasm_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -164,15 +189,16 @@ fn wasm_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgba_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -182,15 +208,16 @@ fn wasm_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb_f32 width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -200,11 +227,255 @@ fn wasm_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb_f16 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); 
} } + +// ---- BE parity tests — wasm-simd128 Rgbf32 ----------------------------------- + +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +fn wasm_rgbf32_to_rgb_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +fn wasm_rgbf32_to_rgba_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +fn wasm_rgbf32_to_rgb_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +fn wasm_rgbf32_to_rgba_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "wasm rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +fn wasm_rgbf32_to_rgb_f32_be_is_byteswap() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgb_f32 BE parity width {w}"); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `v128_load`/`v128_store` copy in the `BE = false` branch, which preserved +/// the LE byte order on store and produced corrupted (byte-swapped) host f32s. +/// The current kernel falls through to the endian-aware `load_f32x4::` +/// slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this test passes on both. +/// (`wasm32-*` is LE today, but the routing is endian-agnostic for any future +/// BE wasm target.) 
+#[test] +fn wasm_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. + let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "wasm rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + +// ---- BE parity tests — wasm-simd128 Rgbf16 ----------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgba_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgba_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "wasm rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_f32_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + 
assert_eq!(out_le, out_be, "wasm rgbf16_to_rgb_f32 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_f16_be_is_byteswap() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgb_f16 BE parity width {w}"); + } +} diff --git a/src/row/arch/x86_avx2/packed_rgb_float.rs b/src/row/arch/x86_avx2/packed_rgb_float.rs index d40d622b..32daae3f 100644 --- a/src/row/arch/x86_avx2/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/packed_rgb_float.rs @@ -6,12 +6,46 @@ //! narrow); cross-lane unpacks need `_mm256_permute4x64_epi64` to fix //! the 128-bit lane interleave that AVX2 packs leave behind. //! +//! For `` kernels, each 8-lane f32 load is replaced by +//! `load_endian_u32x8::` (a `__m256i` with byte-swapped u32 lanes +//! for BE inputs) followed by `_mm256_castsi256_ps` to reinterpret as f32. +//! //! Pixel-aligned chunks of 8 pixels = 24 lanes per iteration so the //! tail handles 0–7 leftover pixels. use core::arch::x86_64::*; +use super::endian::load_endian_u32x8; +// For f16 widen we need 128-bit u16 load (8 × u16). use super::scalar; +use crate::row::arch::x86_sse41::endian::load_endian_u16x8; + +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `_mm256_cvtph_ps`. On a LE target, +/// host-native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `_mm256_loadu_ps`/`_mm256_storeu_ps` copy is byte-correct only when the +/// source encoding (`BE`) matches the host's native endian, so the kernel +/// falls through to the endian-aware `load_f32x8::` slow path otherwise. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Load 8 f32 lanes from `ptr` in endian-aware fashion. +/// +/// # Safety +/// +/// AVX2 must be available; `ptr` must be valid for 32 bytes. +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn load_f32x8(ptr: *const f32) -> __m256 { + unsafe { + let u = load_endian_u32x8::(ptr as *const u8); + _mm256_castsi256_ps(u) + } +} #[inline(always)] unsafe fn clamp_scale_to_u32_256(v: __m256, zero: __m256, one: __m256, scale: __m256) -> __m256i { @@ -29,6 +63,8 @@ unsafe fn clamp_scale_to_u32_256(v: __m256, zero: __m256, one: __m256, scale: __ /// f32 RGB → u8 RGB. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -36,7 +72,11 @@ unsafe fn clamp_scale_to_u32_256(v: __m256, zero: __m256, one: __m256, scale: __ /// 3. `rgb_in` / `rgb_out` must not alias. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -49,9 +89,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let mut lane = 0usize; // 8 pixels = 24 lanes per iter. Three 256-bit f32 loads → 24 lanes. while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -88,7 +128,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -98,9 +138,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -113,9 +159,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid let mut lane = 0usize; let mut pix = 0usize; while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -144,7 +190,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 8; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -154,9 +200,15 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid } /// f32 RGB → u16 RGB. +/// +/// When `BE = true` the input `f32` values are big-endian encoded. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -168,9 +220,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -197,7 +249,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -207,9 +259,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -222,9 +280,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut lane = 0usize; let mut pix = 0usize; while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -250,7 +308,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 8; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -260,23 +318,53 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } /// f32 RGB → f32 RGB lossless pass-through. +/// +/// When `BE = true` the input values are byte-swapped to host-native before +/// being written. 
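+/// When `BE` matches the host's native endianness the kernel degenerates to a
+/// plain `_mm256_loadu_ps` / `_mm256_storeu_ps` copy.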
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); unsafe { let total = width * 3; let mut i = 0usize; - while i + 8 <= total { - let v = _mm256_loadu_ps(rgb_in.as_ptr().add(i)); - _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); - i += 8; - } - while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); - i += 1; + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `_mm256_loadu_ps` reads host- + // native bytes which is exactly what we need to emit. Otherwise we must + // decode through `load_f32x8::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. + if BE == HOST_NATIVE_BE { + while i + 8 <= total { + let v = _mm256_loadu_ps(rgb_in.as_ptr().add(i)); + _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 8; + } + while i < total { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + i += 1; + } + } else { + while i + 8 <= total { + let v = load_f32x8::(rgb_in.as_ptr().add(i)); + _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 8; + } + while i < total { + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); + i += 1; + } } } } @@ -285,31 +373,38 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // // `_mm256_cvtph_ps` (F16C) widens 8 × f16 (stored as 8 × i16 in a __m128i) // to 8 × f32 in a __m256. We load 16 bytes (8 f16 values) via -// `_mm_loadu_si128`. -// -// Downstream: after widening a 24-lane chunk (= 8 pixels) to f32, we call the -// existing AVX2 Rgbf32 kernels. The scalar tail uses -// `crate::row::scalar::rgbf16_to_*_row`. +// `load_endian_u16x8::` which routes to a host-native pass-through when +// the on-disk encoding matches host-native and to a byte-swap shuffle +// otherwise — correct on both LE and BE hosts. // // `#[target_feature(enable = "avx2,f16c")]` ensures both features are active. /// Widen 8 × f16 (at `ptr`, 16 bytes) to 8 × f32 (returned as `__m256`). /// +/// For `BE = true` the f16 values are stored big-endian; bytes are swapped +/// before the F16C widening conversion. The historical `BE = false` branch +/// used a raw `_mm_loadu_si128` which assumed LE-encoded input on a LE host; +/// `load_endian_u16x8::` is correct on both LE and BE hosts because it +/// monomorphizes to a no-op load when on-disk encoding matches host-native +/// and to a byte-swap shuffle otherwise. +/// /// # Safety /// /// * AVX2 + F16C must be available. /// * `ptr` must be valid for 16 bytes (8 × u16 / f16). #[inline] #[target_feature(enable = "avx2,f16c")] -unsafe fn widen_f16x8_avx(ptr: *const half::f16) -> __m256 { +unsafe fn widen_f16x8_avx(ptr: *const half::f16) -> __m256 { unsafe { - let raw = _mm_loadu_si128(ptr as *const __m128i); + let raw = load_endian_u16x8::(ptr as *const u8); _mm256_cvtph_ps(raw) } } /// f16 RGB → u8 RGB (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX2 and F16C must be available. 
@@ -317,7 +412,11 @@ unsafe fn widen_f16x8_avx(ptr: *const half::f16) -> __m256 { /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -327,19 +426,22 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); + // Buffer is host-native f32 after _mm256_cvtph_ps; route via + // HOST_NATIVE_BE so the f32 loaders perform a no-op byte-swap on + // both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); } lane += 24; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -349,12 +451,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha `0xFF`) (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -364,19 +472,24 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); + // Buffer is host-native f32; route via HOST_NATIVE_BE. 
+ rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), + 8, + ); } lane += 24; pix += 8; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -386,13 +499,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -405,19 +520,20 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); } lane += 24; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -427,12 +543,14 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha `0xFFFF`) (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -446,19 +564,24 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); + // Buffer is host-native f32; route via HOST_NATIVE_BE. 
+ rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), + 8, + ); } lane += 24; pix += 8; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -468,13 +591,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen) (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -486,33 +611,42 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 8 <= total_lanes { unsafe { - let f = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); + let f = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); _mm256_storeu_ps(rgb_out.as_mut_ptr().add(lane), f); } lane += 8; } // Scalar tail for the last 0-7 lanes. + #[allow(clippy::needless_range_loop)] for i in lane..total_lanes { + let bits = rgb_in[i].to_bits(); + let h = half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }); unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + *rgb_out.get_unchecked_mut(i) = h.to_f32(); } } } /// f16 RGB → f16 RGB lossless pass-through (AVX2 + F16C). /// +/// When `BE = true` the input values are byte-swapped to host-native order. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs index ab32d1a0..0da58e97 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs @@ -1,6 +1,23 @@ use super::super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 / six Rgbf16 SIMD-vs-scalar parity tests below build +// fixtures via host-native f32 / f16 (`pseudo_random_rgbf32` / +// `pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. +// +// `target_arch = "x86_64"` always implies `target_endian = "little"`, +// so the `#[cfg(target_endian = "little")]` gate is functionally a +// no-op on every supported configuration. It's added here for +// structural consistency with the NEON / scalar gating pattern. The +// LE-decode regression tests further down build LE-encoded fixtures +// via `half::f16::from_bits(u16::from_le(_))` and are correctly +// vacuous on LE hosts (probe-the-bug on hypothetical BE), so they +// are intentionally NOT gated. 
fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { let n = width * 3; @@ -21,6 +38,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgb_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -29,15 +47,16 @@ fn avx2_rgbf32_to_rgb_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgba_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -46,15 +65,16 @@ fn avx2_rgbf32_to_rgba_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgba width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgb_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -63,15 +83,16 @@ fn avx2_rgbf32_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgb_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgba_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -80,15 +101,16 @@ fn avx2_rgbf32_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgba_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgb_f32_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -97,9 +119,9 @@ fn avx2_rgbf32_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -116,6 +138,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = 
"SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -128,15 +151,16 @@ fn avx2_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2+F16C rgbf16_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -149,15 +173,16 @@ fn avx2_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2+F16C rgbf16_to_rgba width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -170,9 +195,9 @@ fn avx2_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -182,6 +207,7 @@ fn avx2_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -194,9 +220,9 @@ fn avx2_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -206,6 +232,7 @@ fn avx2_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -218,9 +245,9 @@ fn avx2_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -230,6 +257,7 @@ fn avx2_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -242,9 +270,9 @@ fn avx2_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - 
scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -253,3 +281,448 @@ fn avx2_rgbf16_to_rgb_f16_matches_scalar() { assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — AVX2 Rgbf32 ------------------------------------------ + +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_f32_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + 
rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgb_f32 BE parity width {w}"); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `_mm256_loadu_ps`/`_mm256_storeu_ps` copy in the `BE = false` branch, which +/// preserved the LE byte order on store and produced corrupted (byte-swapped) +/// host f32s. The current kernel falls through to the endian-aware +/// `load_f32x8::` slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this +/// test passes on both. +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. + let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "AVX2 rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + +// ---- BE parity tests — AVX2 + F16C Rgbf16 ------------------------------------ + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgba BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le 
= std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_f32_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb_f32 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_f16_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb_f16 BE parity width {w}" + ); + } +} + +// ---- LE-decode regression tests — AVX2 + F16C `widen_f16x8_avx` ------------- +// +// These guard against the historical bug where `widen_f16x8_avx::` used +// a raw `_mm_loadu_si128` to read 8 × f16 — that's a host-native u16 load, so +// on a BE host with LE-encoded input the lanes were mis-decoded as host-BE +// before the F16C widening conversion. The fixed helper always routes through +// `load_endian_u16x8::` which monomorphizes to a pass-through on the +// matching host-encoding axis and to a byte-swap shuffle otherwise, so each +// kernel decodes correctly on both LE and BE hosts. + +/// Build an LE-encoded f16 fixture from a host-native one. On LE hosts this is +/// identity; on BE hosts each lane is byte-swapped (so an LE-encoded buffer +/// looks byte-swapped relative to host-native bits). 
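// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// Per-lane model of the property the BE-parity tests above assert at row
// level: byte-swapping a lane's bits while flipping the kernel's `BE` flag
// must decode to the same host-native value, so `out_le == out_be`. Std-only.
fn decode_f32_lane<const BE: bool>(raw: f32) -> f32 {
    let bits = raw.to_bits();
    f32::from_bits(if BE { u32::from_be(bits) } else { u32::from_le(bits) })
}

fn main() {
    let le_lane = 0.62_f32;
    let be_lane = f32::from_bits(le_lane.to_bits().swap_bytes()); // same trick as `be_rgbf32`
    assert_eq!(
        decode_f32_lane::<false>(le_lane),
        decode_f32_lane::<true>(be_lane)
    );
}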
+fn le_rgbf16_from_host(host: &[half::f16]) -> std::vec::Vec { + host + .iter() + .map(|v| half::f16::from_bits(u16::from_le(v.to_bits()))) + .collect() +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 3]; + let mut out_scalar = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgb_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 4]; + let mut out_scalar = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgba_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgba_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 3]; + let mut out_scalar = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgb_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 4]; + let mut out_scalar = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgba_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgba_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || 
!std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0.0f32; w * 3]; + let mut out_scalar = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_f32_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgb_f32_row:: must decode LE input to match scalar (width {w})" + ); + } +} diff --git a/src/row/arch/x86_avx512/packed_rgb_float.rs b/src/row/arch/x86_avx512/packed_rgb_float.rs index 90bdbd78..d4d80802 100644 --- a/src/row/arch/x86_avx512/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/packed_rgb_float.rs @@ -2,13 +2,47 @@ //! (`Rgbf32`) source. 16-lane `__m512` registers; same lane-aligned //! pixel chunking as the AVX2 backend at twice the throughput. //! +//! For `` kernels, each 16-lane f32 load is replaced by +//! `load_endian_u32x16::` (a `__m512i` with byte-swapped u32 lanes +//! for BE inputs) followed by `_mm512_castsi512_ps` to reinterpret as f32. +//! //! Process 16 pixels = 48 lanes per iteration so the loop boundary //! lands on a pixel boundary; the scalar tail handles the leftover //! 0–15 pixels. use core::arch::x86_64::*; +use super::endian::load_endian_u32x16; +// For f16 widen we need a 256-bit u16 load (16 × u16 = 32 bytes). use super::scalar; +use crate::row::arch::x86_avx2::endian::load_endian_u16x16; + +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `_mm512_cvtph_ps`. On a LE target, +/// host-native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `_mm512_loadu_ps`/`_mm512_storeu_ps` copy is byte-correct only when the +/// source encoding (`BE`) matches the host's native endian, so the kernel +/// falls through to the endian-aware `load_f32x16::` slow path otherwise. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Load 16 f32 lanes from `ptr` in endian-aware fashion. +/// +/// # Safety +/// +/// AVX-512F must be available; `ptr` must be valid for 64 bytes. +#[inline] +#[target_feature(enable = "avx512f")] +unsafe fn load_f32x16(ptr: *const f32) -> __m512 { + unsafe { + let u = load_endian_u32x16::(ptr as *const u8); + _mm512_castsi512_ps(u) + } +} #[inline(always)] unsafe fn clamp_scale_to_u32_512(v: __m512, zero: __m512, one: __m512, scale: __m512) -> __m512i { @@ -24,6 +58,8 @@ unsafe fn clamp_scale_to_u32_512(v: __m512, zero: __m512, one: __m512, scale: __ /// f32 RGB → u8 RGB. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. @@ -31,7 +67,11 @@ unsafe fn clamp_scale_to_u32_512(v: __m512, zero: __m512, one: __m512, scale: __ /// 3. `rgb_in` / `rgb_out` must not alias. 
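// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// Why routing through `BE = HOST_NATIVE_BE` (defined above in this file) is a
// no-op decode: `u32::from_be` is the identity on big-endian hosts and
// `u32::from_le` is the identity on little-endian hosts, so selecting the
// host's own endianness always picks the identity branch. Std-only check:
fn main() {
    let host_native_be = cfg!(target_endian = "big");
    let bits = 0x3E80_0000_u32; // 0.25f32, already in host byte order
    let decoded = if host_native_be {
        u32::from_be(bits)
    } else {
        u32::from_le(bits)
    };
    assert_eq!(decoded, bits);
}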
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -44,9 +84,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let mut lane = 0usize; // 16 pixels = 48 lanes per iter (3 × 16-lane f32 loads). while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -67,7 +107,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -77,9 +117,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -92,9 +138,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid let mut lane = 0usize; let mut pix = 0usize; while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -122,7 +168,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 16; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -132,9 +178,15 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid } /// f32 RGB → u16 RGB. +/// +/// When `BE = true` the input `f32` values are big-endian encoded. 
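// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// Scalar reference for the clamp-and-scale step these integer-output kernels
// apply 48 lanes at a time. The vector conversion rounds to nearest-even;
// `round()` rounds halfway cases away from zero, so this is a close model
// rather than a bit-exact one.
fn lane_to_u8(v: f32) -> u8 {
    (v.clamp(0.0, 1.0) * 255.0).round() as u8
}

fn lane_to_u16(v: f32) -> u16 {
    (v.clamp(0.0, 1.0) * 65535.0).round() as u16
}

fn main() {
    assert_eq!(lane_to_u8(-0.5), 0); // clamped low
    assert_eq!(lane_to_u8(2.0), 255); // clamped high
    assert_eq!(lane_to_u16(1.0), 0xFFFF); // full scale
}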
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -146,9 +198,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -168,7 +220,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -178,9 +230,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -193,9 +251,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut lane = 0usize; let mut pix = 0usize; while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -221,7 +279,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 16; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -231,23 +289,53 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } /// f32 RGB → f32 RGB lossless pass-through. +/// +/// When `BE = true` the input values are byte-swapped to host-native before +/// being written. 
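// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// The pass-through below may copy bytes verbatim only when the source
// encoding matches the host's endianness; otherwise each lane has to be
// byte-swapped before it is stored. Std-only model of the two cases:
fn main() {
    let raw = 1.0_f32; // lane whose bytes are already in host order
    // Decoding with the host's own endianness is the identity …
    let native_bits = if cfg!(target_endian = "big") {
        u32::from_be(raw.to_bits())
    } else {
        u32::from_le(raw.to_bits())
    };
    assert_eq!(native_bits, raw.to_bits());
    // … while the mismatched case is a byte swap, so a raw copy would emit the
    // wrong value (1.0f32: 0x3F80_0000 vs 0x0000_803F).
    assert_ne!(raw.to_bits().swap_bytes(), raw.to_bits());
}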
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); unsafe { let total = width * 3; let mut i = 0usize; - while i + 16 <= total { - let v = _mm512_loadu_ps(rgb_in.as_ptr().add(i)); - _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); - i += 16; - } - while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); - i += 1; + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `_mm512_loadu_ps` reads host- + // native bytes which is exactly what we need to emit. Otherwise we must + // decode through `load_f32x16::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. + if BE == HOST_NATIVE_BE { + while i + 16 <= total { + let v = _mm512_loadu_ps(rgb_in.as_ptr().add(i)); + _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 16; + } + while i < total { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + i += 1; + } + } else { + while i + 16 <= total { + let v = load_f32x16::(rgb_in.as_ptr().add(i)); + _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 16; + } + while i < total { + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); + i += 1; + } } } } @@ -255,36 +343,41 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // ---- Tier 9 — Rgbf16 AVX-512 + F16C entry points --------------------------- // // `_mm512_cvtph_ps` (F16C + AVX-512F) widens 16 × f16 (stored as 16 × i16 in -// a __m256i) to 16 × f32 in a __m512. We load 32 bytes (16 f16 values) via -// `_mm256_loadu_si256`. +// a __m256i) to 16 × f32 in a __m512. // -// Downstream: after widening a 48-lane chunk (= 16 pixels) to f32, we call the -// existing AVX-512 Rgbf32 kernels. The scalar tail uses -// `crate::row::scalar::rgbf16_to_*_row`. +// Load 32 bytes as __m256i via `load_endian_u16x16::` which byte-swaps +// each u16 when the on-disk encoding doesn't match host-native, then call +// `_mm512_cvtph_ps`. // // `#[target_feature(enable = "avx512f,f16c")]` — `f16c` is the half↔single -// narrowing/widening extension. AVX-512F + F16C is the minimum for -// `_mm512_cvtph_ps`. AVX-512BW is a separate CPU-feature bit and is NOT -// implied by AVX-512F; only enable `avx512bw` on functions that actually -// use byte/word AVX-512 ops. +// narrowing/widening extension. /// Widen 16 × f16 (at `ptr`, 32 bytes) to 16 × f32 (returned as `__m512`). /// +/// For `BE = true` the f16 values are stored big-endian; bytes are swapped +/// before the F16C widening conversion. The historical `BE = false` branch +/// used a raw `_mm256_loadu_si256` which assumed LE-encoded input on a LE +/// host; `load_endian_u16x16::` is correct on both LE and BE hosts because +/// it monomorphizes to a no-op load when on-disk encoding matches host-native +/// and to a byte-swap shuffle otherwise. +/// /// # Safety /// /// * AVX-512F + F16C must be available. /// * `ptr` must be valid for 32 bytes (16 × u16 / f16). 
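// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// Per-lane scalar model of the endian-aware f16 → f32 widen performed by the
// helper below, assuming the `half` crate (already used throughout these
// kernels):
fn widen_f16_lane<const BE: bool>(raw: half::f16) -> f32 {
    let bits = raw.to_bits();
    let host = if BE { u16::from_be(bits) } else { u16::from_le(bits) };
    half::f16::from_bits(host).to_f32()
}

fn main() {
    // An LE-encoded f16 `1.0` occupies the bytes [0x00, 0x3C] on disk/wire.
    let raw = half::f16::from_bits(u16::from_ne_bytes([0x00, 0x3C]));
    assert_eq!(widen_f16_lane::<false>(raw), 1.0_f32);
}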
#[inline] #[target_feature(enable = "avx512f,f16c")] -unsafe fn widen_f16x16_avx512(ptr: *const half::f16) -> __m512 { +unsafe fn widen_f16x16_avx512(ptr: *const half::f16) -> __m512 { unsafe { - let raw = _mm256_loadu_si256(ptr as *const __m256i); + let raw = load_endian_u16x16::(ptr as *const u8); _mm512_cvtph_ps(raw) } } /// f16 RGB → u8 RGB (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX-512F, AVX-512BW, and F16C must be available. @@ -292,7 +385,11 @@ unsafe fn widen_f16x16_avx512(ptr: *const half::f16) -> __m512 { /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -302,19 +399,22 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); + // Buffer is host-native f32 after _mm512_cvtph_ps; route via + // HOST_NATIVE_BE so the f32 loaders perform a no-op byte-swap on + // both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); } lane += 48; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -324,12 +424,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha `0xFF`) (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
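// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// The chunking shared by these Rgbf16 kernels: 48 interleaved R,G,B lanes
// (= 16 whole pixels) per SIMD iteration, with the scalar kernels taking the
// remaining 0–15 pixels. Std-only:
fn split_row(width: usize) -> (usize, usize) {
    let total_lanes = width * 3;
    let simd_lanes = total_lanes - total_lanes % 48; // stop on a pixel boundary
    let pix_done = simd_lanes / 3;
    (pix_done, width - pix_done) // (pixels handled by SIMD, pixels left to scalar)
}

fn main() {
    assert_eq!(split_row(16), (16, 0));
    assert_eq!(split_row(33), (32, 1));
    assert_eq!(split_row(1921), (1920, 1));
}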
#[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -339,19 +445,24 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), + 16, + ); } lane += 48; pix += 16; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -361,13 +472,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -380,19 +493,20 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); } lane += 48; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -402,12 +516,14 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha `0xFFFF`) (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. 
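// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// Scalar reference for the RGBA expansion used by the `*_rgba*` kernels: each
// pixel contributes its three converted lanes plus a forced opaque alpha
// (0xFF for the u8 output, 0xFFFF for the u16 output).
fn rgb_to_rgba_u16(rgb: [u16; 3]) -> [u16; 4] {
    [rgb[0], rgb[1], rgb[2], 0xFFFF]
}

fn main() {
    assert_eq!(rgb_to_rgba_u16([1, 2, 3]), [1, 2, 3, 0xFFFF]);
}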
#[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -421,19 +537,24 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), + 16, + ); } lane += 48; pix += 16; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -443,13 +564,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen) (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -461,33 +584,42 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 16 <= total_lanes { unsafe { - let f = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); + let f = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); _mm512_storeu_ps(rgb_out.as_mut_ptr().add(lane), f); } lane += 16; } // Scalar tail for the last 0-15 lanes. + #[allow(clippy::needless_range_loop)] for i in lane..total_lanes { + let bits = rgb_in[i].to_bits(); + let h = half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }); unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + *rgb_out.get_unchecked_mut(i) = h.to_f32(); } } } /// f16 RGB → f16 RGB lossless pass-through (AVX-512F + F16C). /// +/// When `BE = true` the input values are byte-swapped to host-native order. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. 
#[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs index 38c27804..261ce54c 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs @@ -1,6 +1,23 @@ use super::super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 / six Rgbf16 SIMD-vs-scalar parity tests below build +// fixtures via host-native f32 / f16 (`pseudo_random_rgbf32` / +// `pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. +// +// `target_arch = "x86_64"` always implies `target_endian = "little"`, +// so the `#[cfg(target_endian = "little")]` gate is functionally a +// no-op on every supported configuration. It's added here for +// structural consistency with the NEON / scalar gating pattern. The +// LE-decode regression tests further down build LE-encoded fixtures +// via `half::f16::from_bits(u16::from_le(_))` and are correctly +// vacuous on LE hosts (probe-the-bug on hypothetical BE), so they +// are intentionally NOT gated. fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { let n = width * 3; @@ -21,6 +38,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgb_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -29,15 +47,16 @@ fn avx512_rgbf32_to_rgb_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgba_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -46,15 +65,16 @@ fn avx512_rgbf32_to_rgba_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgba width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgb_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -63,15 +83,16 @@ fn avx512_rgbf32_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - 
scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgb_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgba_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -80,15 +101,16 @@ fn avx512_rgbf32_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgba_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgb_f32_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -97,9 +119,9 @@ fn avx512_rgbf32_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -116,6 +138,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -130,15 +153,16 @@ fn avx512_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512+F16C rgbf16_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -153,9 +177,9 @@ fn avx512_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -165,6 +189,7 @@ fn avx512_rgbf16_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -179,9 +204,9 @@ fn avx512_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_simd, 
w); + rgbf16_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -191,6 +216,7 @@ fn avx512_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -205,9 +231,9 @@ fn avx512_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -217,6 +243,7 @@ fn avx512_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -231,9 +258,9 @@ fn avx512_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -243,6 +270,7 @@ fn avx512_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -257,9 +285,9 @@ fn avx512_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -268,3 +296,486 @@ fn avx512_rgbf16_to_rgb_f16_matches_scalar() { assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — AVX-512 Rgbf32 ---------------------------------------- + +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX-512 rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let 
le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX-512 rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 rgbf32_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_f32_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx512f") { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 rgbf32_to_rgb_f32 BE parity width {w}" + ); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `_mm512_loadu_ps`/`_mm512_storeu_ps` copy in the `BE = false` branch, which +/// preserved the LE byte order on store and produced corrupted (byte-swapped) +/// host f32s. The current kernel falls through to the endian-aware +/// `load_f32x16::` slow path on BE hosts (`HOST_NATIVE_BE != BE`) so +/// this test passes on both. +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512f") { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. 
+ let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "AVX-512 rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + +// ---- BE parity tests — AVX-512 + F16C Rgbf16 --------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgba BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn 
avx512_rgbf16_to_rgb_f32_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb_f32 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_f16_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx512f") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb_f16 BE parity width {w}" + ); + } +} + +// ---- LE-decode regression tests — AVX-512 + F16C `widen_f16x16_avx512` ------ +// +// These guard against the historical bug where `widen_f16x16_avx512::` +// used a raw `_mm256_loadu_si256` to read 16 × f16 — that's a host-native u16 +// load, so on a BE host with LE-encoded input the lanes were mis-decoded as +// host-BE before the F16C widening conversion. The fixed helper always routes +// through `load_endian_u16x16::` which monomorphizes to a pass-through on +// the matching host-encoding axis and to a byte-swap shuffle otherwise, so +// each kernel decodes correctly on both LE and BE hosts. + +/// Build an LE-encoded f16 fixture from a host-native one. 
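// ---- Editor's sketch (illustrative, not part of the patch) ------------------
// What the fixture helper below produces: on little-endian hosts
// `u16::from_le` is the identity, so the "LE-encoded" fixture equals the
// host-native one and these regression tests reduce to plain SIMD-vs-scalar
// parity; on big-endian hosts every lane is byte-swapped, which is the shape
// that exposed the raw host-native load bug. Std-only, assuming `half`:
fn main() {
    let host = half::f16::from_f32(0.75);
    let le_encoded = half::f16::from_bits(u16::from_le(host.to_bits()));
    if cfg!(target_endian = "little") {
        assert_eq!(le_encoded.to_bits(), host.to_bits()); // identity fixture
    } else {
        assert_eq!(le_encoded.to_bits(), host.to_bits().swap_bytes()); // swapped fixture
    }
}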
+fn le_rgbf16_from_host(host: &[half::f16]) -> std::vec::Vec { + host + .iter() + .map(|v| half::f16::from_bits(u16::from_le(v.to_bits()))) + .collect() +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 3]; + let mut out_scalar = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgb_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 4]; + let mut out_scalar = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgba_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgba_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 3]; + let mut out_scalar = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgb_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 4]; + let mut out_scalar = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgba_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgba_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if 
!std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0.0f32; w * 3]; + let mut out_scalar = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_f32_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgb_f32_row:: must decode LE input to match scalar (width {w})" + ); + } +} diff --git a/src/row/arch/x86_sse41/endian.rs b/src/row/arch/x86_sse41/endian.rs index f7dc1d38..992ca30e 100644 --- a/src/row/arch/x86_sse41/endian.rs +++ b/src/row/arch/x86_sse41/endian.rs @@ -119,3 +119,59 @@ pub(crate) unsafe fn load_endian_u32x4(ptr: *const u8) -> __m128 unsafe { load_le_u32x4(ptr) } } } + +// ---- u16x4 loaders (8-byte half-vector) ------------------------------------ +// +// These load only 8 bytes (4 × u16) into the low half of an `__m128i` and +// zero the upper half. Used by Rgbf16 widen kernels (`_mm_cvtph_ps` reads +// the low 64 bits = 4 × f16) when the caller can only guarantee 8 readable +// bytes — using the 16-byte `load_endian_u16x8` would tail-overread. + +/// Loads 4 × u16 from `ptr` (LE-encoded on disk/wire) into the low half of +/// an `__m128i` in host-native order; the upper half is zero. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes. Caller must have SSE4.1 +/// (and SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_le_u16x4(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadl_epi64(ptr.cast()) }; + // On LE hosts the on-disk LE bytes already match host-native; on BE hosts + // we'd need to byte-swap, but the shuffle mask references only source + // bytes [0..8) which are the loaded bytes (upper half is zero from + // `_mm_loadl_epi64`), so the byte-swap is correct. + #[cfg(target_endian = "big")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) }; + v +} + +/// Loads 4 × u16 from `ptr` (BE-encoded on disk/wire) into the low half of +/// an `__m128i` in host-native order; the upper half is zero. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes. Caller must have SSE4.1 +/// (and SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_be_u16x4(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadl_epi64(ptr.cast()) }; + #[cfg(target_endian = "little")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) }; + v +} + +/// Generic dispatcher: routes to `load_le_u16x4` or `load_be_u16x4` based on +/// the compile-time `BE` const parameter. Reads exactly 8 bytes. +/// +/// # Safety +/// +/// Same as `load_le_u16x4` / `load_be_u16x4`. +#[inline(always)] +pub(crate) unsafe fn load_endian_u16x4(ptr: *const u8) -> __m128i { + if BE { + unsafe { load_be_u16x4(ptr) } + } else { + unsafe { load_le_u16x4(ptr) } + } +} diff --git a/src/row/arch/x86_sse41/packed_rgb_float.rs b/src/row/arch/x86_sse41/packed_rgb_float.rs index b4aec862..9b68dee9 100644 --- a/src/row/arch/x86_sse41/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/packed_rgb_float.rs @@ -8,13 +8,31 @@ //! round-to-nearest-even cast, and `_mm_packus_*` for the saturating //! narrow. //! +//! For `` kernels, each 4-lane f32 load is replaced by +//! `load_endian_u32x4::` (a `__m128i` with byte-swapped u32 lanes +//! for BE inputs) followed by `_mm_castsi128_ps` to reinterpret as f32. +//! //! 
Pixel-aligned chunks (4 pixels = 12 lanes per iter for the u8/u16 //! integer-output paths) keep the loop boundary on a pixel boundary //! so the scalar tail handles only the final 0–3 pixels. use core::arch::x86_64::*; -use super::scalar; +use super::{endian::load_endian_u32x4, scalar}; + +/// Load 4 f32 lanes from `ptr` in endian-aware fashion. +/// +/// # Safety +/// +/// SSE4.1 + SSSE3 must be available; `ptr` must be valid for 16 bytes. +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn load_f32x4(ptr: *const f32) -> __m128 { + unsafe { + let u = load_endian_u32x4::(ptr as *const u8); + _mm_castsi128_ps(u) + } +} #[inline(always)] unsafe fn clamp_scale_to_u32(v: __m128, zero: __m128, one: __m128, scale: __m128) -> __m128i { @@ -34,6 +52,8 @@ unsafe fn clamp_scale_to_u32(v: __m128, zero: __m128, one: __m128, scale: __m128 /// f32 RGB → u8 RGB. Clamp `[0, 1]` × 255, saturating cast. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -41,7 +61,11 @@ unsafe fn clamp_scale_to_u32(v: __m128, zero: __m128, one: __m128, scale: __m128 /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -53,9 +77,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -78,7 +102,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -89,12 +113,18 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -111,9 +141,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid // R, G, B, R, G, B, … layout, so we widen the 12 bytes to 16 by // inserting alpha at the trailing position of each 4-byte group). 
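As a standalone illustration of what the endian-aware `load_f32x4::<BE>` helper described above must produce lane by lane, the scalar sketch below decodes one f32 from raw LE or BE bytes. It is not crate code; the function name is hypothetical and only std is used.

```rust
/// Hypothetical scalar model of one endian-aware f32 lane load.
fn load_f32_lane<const BE: bool>(src: &[u8]) -> f32 {
    let bytes: [u8; 4] = src[..4].try_into().expect("need 4 bytes");
    let bits = if BE {
        u32::from_be_bytes(bytes)
    } else {
        u32::from_le_bytes(bytes)
    };
    f32::from_bits(bits)
}

fn main() {
    let v = 0.625_f32;
    // The same value, serialized in each byte order, must decode identically.
    assert_eq!(load_f32_lane::<false>(&v.to_le_bytes()), v);
    assert_eq!(load_f32_lane::<true>(&v.to_be_bytes()), v);
    println!("LE and BE encodings decode to {v}");
}
```

A SIMD implementation that byte-swaps before reinterpreting as f32 agrees with this per-lane model for every bit pattern, including NaNs, because only the bit pattern is moved, never the numeric value.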
while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -139,7 +169,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -150,13 +180,19 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid /// f32 RGB → u16 RGB. Clamp `[0, 1]` × 65535, saturating cast. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -168,9 +204,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -189,7 +225,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -200,13 +236,19 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_u16_row`] but the output is `&mut [u16]` /// with `len() >= 4 * width` u16 elements. 
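The clamp-and-scale step that `clamp_scale_to_u32` vectorizes can be spot-checked against a scalar reference. The sketch below is illustrative only: the helper names are not the crate's, and round-half-to-even is assumed as the rounding rule (requires Rust 1.77+ for `round_ties_even`).

```rust
/// Illustrative scalar reference for the clamp [0, 1] and scale step.
/// The ties-to-even rounding here is an assumption of this sketch.
fn f32_to_u8_ref(v: f32) -> u8 {
    (v.clamp(0.0, 1.0) * 255.0).round_ties_even() as u8
}

fn f32_to_u16_ref(v: f32) -> u16 {
    (v.clamp(0.0, 1.0) * 65535.0).round_ties_even() as u16
}

fn main() {
    assert_eq!(f32_to_u8_ref(-0.25), 0); // clamped low
    assert_eq!(f32_to_u8_ref(2.0), 255); // clamped high
    assert_eq!(f32_to_u8_ref(0.5), 128); // 127.5 rounds to the even 128
    assert_eq!(f32_to_u16_ref(1.0), 65535);
    println!("clamp/scale spot checks pass");
}
```

The MXCSR regression test further down in this patch pins SIMD-vs-scalar parity even when the global x86 rounding state is switched to truncate.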
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -219,9 +261,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut lane = 0usize; let mut pix = 0usize; while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -246,7 +288,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -257,27 +299,57 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] /// f32 RGB → f32 RGB lossless pass-through. /// +/// When `BE = true` the input values are byte-swapped to host-native +/// before being written. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); unsafe { let total = width * 3; let mut i = 0usize; - while i + 4 <= total { - let v = _mm_loadu_ps(rgb_in.as_ptr().add(i)); - _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v); - i += 4; - } - while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); - i += 1; + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `_mm_loadu_ps` reads host- + // native bytes which is exactly what we need to emit. Otherwise we must + // decode through `load_f32x4::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. + if BE == HOST_NATIVE_BE { + while i + 4 <= total { + let v = _mm_loadu_ps(rgb_in.as_ptr().add(i)); + _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 4; + } + while i < total { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + i += 1; + } + } else { + while i + 4 <= total { + let v = load_f32x4::(rgb_in.as_ptr().add(i)); + _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 4; + } + while i < total { + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); + i += 1; + } } } } @@ -288,31 +360,56 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // of a __m128i) to 4 × f32 in a __m128. 
We load 8 bytes (4 f16 values) via // `_mm_loadl_epi64` (64-bit load into the low half of __m128i). // -// Downstream: after widening a 12-lane chunk (= 4 pixels) to f32, we call the -// existing SSE4.1 Rgbf32 kernels. The scalar tail uses -// `crate::row::scalar::rgbf16_to_*_row`. +// For BE: load 8 bytes via `load_endian_u16x8::` which byte-swaps each +// u16 for big-endian inputs, then call `_mm_cvtph_ps` on the result. // // `#[target_feature(enable = "sse4.1,f16c")]` ensures both features are active // in the body even though F16C is an independent feature bit. +use super::endian::load_endian_u16x4; + +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `_mm_cvtph_ps`. On a LE target, host- +/// native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `_mm_loadu_ps`/`_mm_storeu_ps` copy is byte-correct only when the source +/// encoding (`BE`) matches the host's native endian, so the kernel falls +/// through to the endian-aware `load_f32x4::` slow path otherwise. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Widen 4 × f16 (at `ptr`, 8 bytes) to 4 × f32 (returned as `__m128`). /// +/// For `BE = true` the f16 values are stored big-endian; bytes are swapped +/// before the F16C widening conversion. The loader reads exactly 8 bytes +/// regardless of `BE` so the caller's `ptr` only needs 8 readable bytes +/// (a 16-byte load via `load_endian_u16x8` would tail-overread the 4 × f16 +/// region the kernel actually owns). +/// /// # Safety /// /// * SSE4.1 + F16C must be available. /// * `ptr` must be valid for 8 bytes (4 × u16 / f16). #[inline] #[target_feature(enable = "sse4.1,f16c")] -unsafe fn widen_f16x4_sse(ptr: *const half::f16) -> __m128 { +unsafe fn widen_f16x4_sse(ptr: *const half::f16) -> __m128 { unsafe { - // _mm_loadl_epi64: 64-bit load into the low half of __m128i. - let raw = _mm_loadl_epi64(ptr as *const __m128i); + // 8-byte load (low 64 bits of __m128i, upper half zero). For `BE = true` + // the loader byte-swaps each u16 in place; for `BE = false` it's a plain + // load. `_mm_cvtph_ps` reads only the low 4 × f16 (low 64 bits), so the + // upper half being zero is harmless. + let raw = load_endian_u16x4::(ptr as *const u8); _mm_cvtph_ps(raw) } } /// f16 RGB → u8 RGB (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// 1. SSE4.1 and F16C must be available. @@ -320,7 +417,11 @@ unsafe fn widen_f16x4_sse(ptr: *const half::f16) -> __m128 { /// 3. `rgb_in` / `rgb_out` must not alias. 
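The `HOST_NATIVE_BE` routing described above rests on one invariant: decoding with `BE` equal to the host's own endianness is a bit-for-bit no-op. A minimal std-only sketch of that invariant, with a hypothetical `decode_u32` standing in for the endian-aware loaders:

```rust
/// True on big-endian hosts, false on little-endian hosts.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

/// Stand-in for an endian-aware loader: normalizes stored bits to host order.
fn decode_u32<const BE: bool>(stored: u32) -> u32 {
    if BE { u32::from_be(stored) } else { u32::from_le(stored) }
}

fn main() {
    let host_native_bits = 0.5_f32.to_bits();
    // Decoding host-native data "as host-native" must not change it, on any
    // host. This is why already-widened f32 stack buffers are routed through
    // HOST_NATIVE_BE rather than the kernel's own BE parameter.
    assert_eq!(decode_u32::<HOST_NATIVE_BE>(host_native_bits), host_native_bits);
    println!("host-native routing is a no-op swap");
}
```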
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -330,19 +431,21 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE so the f32 loaders + // perform a no-op swap on both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -352,12 +455,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha `0xFF`) (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -367,19 +476,24 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -389,13 +503,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. 
+/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -408,19 +524,20 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -430,12 +547,14 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha `0xFFFF`) (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -449,19 +568,24 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -471,13 +595,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen) (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. 
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -489,7 +615,7 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 4 <= total_lanes { unsafe { - let f = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); + let f = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); _mm_storeu_ps(rgb_out.as_mut_ptr().add(lane), f); } lane += 4; @@ -497,25 +623,39 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( // Scalar tail for the last 0-3 lanes. for i in lane..total_lanes { unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + let v = load_f16_scalar::(rgb_in, i); + *rgb_out.get_unchecked_mut(i) = v.to_f32(); } } } /// f16 RGB → f16 RGB lossless pass-through (SSE4.1 + F16C). /// +/// When `BE = true` the input values are byte-swapped to host-native order. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); +} + +/// Scalar f16 load helper for tail loops (SSE4.1 module). +#[inline(always)] +fn load_f16_scalar(rgb_in: &[half::f16], i: usize) -> half::f16 { + let bits = rgb_in[i].to_bits(); + half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }) } diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs index 2b917752..3d6f59f1 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs @@ -1,6 +1,28 @@ use super::super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The MXCSR regression test plus the five Rgbf32 / six Rgbf16 SIMD-vs- +// scalar parity tests below build their fixtures via host-native f32 +// (`vec![0.5_f32; ...]`, `pseudo_random_rgbf32`) or host-native f16 +// (`pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. +// +// `target_arch = "x86_64"` always implies `target_endian = "little"`, +// so the `#[cfg(target_endian = "little")]` gate is functionally a +// no-op on every supported configuration. It's added here for +// structural consistency with the NEON / scalar gating pattern, so an +// audit of "tests that take host-native fixtures and call kernels with +// ``" returns a uniform answer across every backend. (If x86 +// ever adds BE support, the gates are already in place.) +// +// SSE4.1 BE-host correctness — when SSE4.1 is run on a hypothetical BE +// target — is locked down separately by the dedicated BE-parity tests +// in this same module (which build LE-encoded fixtures via byte-swap +// helpers and assert ``/`` parity on every host). // MXCSR access via inline asm. 
`_mm_getcsr` / `_mm_setcsr` are deprecated // (the deprecation message itself points at inline assembly), so we use the @@ -34,6 +56,7 @@ unsafe fn write_mxcsr(v: u32) { #[test] #[cfg(target_arch = "x86_64")] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "MXCSR + SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_row_simd_matches_scalar_under_truncate_mxcsr() { if !std::arch::is_x86_feature_detected!("sse4.1") { @@ -52,8 +75,8 @@ fn rgbf32_to_rgb_row_simd_matches_scalar_under_truncate_mxcsr() { let mut simd_out = std::vec![0u8; width * 3]; let mut scalar_out = std::vec![0u8; width * 3]; - unsafe { rgbf32_to_rgb_row(&rgb, &mut simd_out, width) }; - scalar::rgbf32_to_rgb_row(&rgb, &mut scalar_out, width); + unsafe { rgbf32_to_rgb_row::(&rgb, &mut simd_out, width) }; + scalar::rgbf32_to_rgb_row::(&rgb, &mut scalar_out, width); // Restore MXCSR before any assertion so panic formatting doesn't misfire. unsafe { write_mxcsr(saved) }; @@ -83,6 +106,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgb_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -91,15 +115,16 @@ fn sse41_rgbf32_to_rgb_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgba_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -108,15 +133,16 @@ fn sse41_rgbf32_to_rgba_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgba width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgb_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -125,15 +151,16 @@ fn sse41_rgbf32_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgb_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgba_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -142,15 +169,16 @@ fn sse41_rgbf32_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } 
assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgba_u16 width {w}"); } } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgb_f32_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -159,9 +187,9 @@ fn sse41_rgbf32_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -178,6 +206,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -191,15 +220,16 @@ fn sse41_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1+F16C rgbf16_to_rgb width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -213,15 +243,16 @@ fn sse41_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1+F16C rgbf16_to_rgba width {w}"); } } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -235,9 +266,9 @@ fn sse41_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -247,6 +278,7 @@ fn sse41_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -260,9 +292,9 @@ fn sse41_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -272,6 +304,7 @@ fn sse41_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels 
use intrinsics unsupported by Miri" @@ -285,9 +318,9 @@ fn sse41_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -297,6 +330,7 @@ fn sse41_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -310,9 +344,9 @@ fn sse41_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -321,3 +355,319 @@ fn sse41_rgbf16_to_rgb_f16_matches_scalar() { assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — SSE4.1 Rgbf32 ---------------------------------------- +// +// For each kernel: byte-swap the LE f32 inputs into a BE buffer, call the +// kernel with `BE=true`, and assert the output matches the LE run (`BE=false`). +// x86 feature detection guards required (memory: x86_test_feature_guard). + +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "SSE4.1 rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "SSE4.1 rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + 
rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 rgbf32_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_f32_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 rgbf32_to_rgb_f32 BE parity width {w}" + ); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `_mm_loadu_ps`/`_mm_storeu_ps` copy in the `BE = false` branch, which +/// preserved the LE byte order on store and produced corrupted (byte-swapped) +/// host f32s. The current kernel falls through to the endian-aware +/// `load_f32x4::` slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this +/// test passes on both. +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. 
+ let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "SSE4.1 rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + +// ---- BE parity tests — SSE4.1 + F16C Rgbf16 ---------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgba BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_f32_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); 
+ let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb_f32 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_f16_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb_f16 BE parity width {w}" + ); + } +} diff --git a/src/row/dispatch/rgb_f16_ops.rs b/src/row/dispatch/rgb_f16_ops.rs index 8dce5df5..c96af5d5 100644 --- a/src/row/dispatch/rgb_f16_ops.rs +++ b/src/row/dispatch/rgb_f16_ops.rs @@ -44,7 +44,12 @@ use crate::row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, s /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgb_out_min = rgb_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf16 row too short"); @@ -55,38 +60,38 @@ pub fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize, target_arch = "aarch64" => { if neon_available() && fp16_available() { // SAFETY: `neon_available()` verified NEON is present. - unsafe { arch::neon::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { // SAFETY: AVX-512F + F16C verified. - unsafe { arch::x86_avx512::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { // SAFETY: AVX2 + F16C verified. - unsafe { arch::x86_avx2::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { // SAFETY: SSE4.1 + F16C verified. - unsafe { arch::x86_sse41::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `half::f16` input to packed `R, G, B, A` `u8` @@ -94,7 +99,12 @@ pub fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize, /// /// `use_simd = false` forces the scalar reference path. 
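The dispatch functions in this file all share one shape: a `const BE: bool` parameter is threaded unchanged from the public entry point into whichever arch kernel (or the scalar fallback) wins feature detection. A reduced sketch of that shape, with a stubbed feature probe and a toy kernel (none of these names are the crate's):

```rust
/// Toy "kernel": normalize each stored u16 from the declared byte order to
/// host order. Stands in for an arch-specific row kernel.
fn kernel<const BE: bool>(input: &[u16], out: &mut [u16]) {
    for (o, &v) in out.iter_mut().zip(input) {
        *o = if BE { u16::from_be(v) } else { u16::from_le(v) };
    }
}

/// Stubbed runtime feature probe; a real dispatcher would query CPU features.
fn simd_available() -> bool {
    false
}

/// Dispatcher shape: `BE` is fixed at compile time per call site, while the
/// SIMD-vs-scalar choice is made at run time.
pub fn normalize_row<const BE: bool>(input: &[u16], out: &mut [u16], use_simd: bool) {
    if use_simd && simd_available() {
        kernel::<BE>(input, out); // would be the SIMD kernel
        return;
    }
    kernel::<BE>(input, out); // scalar reference path
}

fn main() {
    let stored_be = [0x1234u16.to_be(), 0xBEEFu16.to_be()];
    let mut out = [0u16; 2];
    normalize_row::<true>(&stored_be, &mut out, true);
    assert_eq!(out, [0x1234, 0xBEEF]);
    println!("BE input normalized to host order");
}
```

Because `BE` is const, the swap branch can fold away at monomorphization, so threading it through every layer costs nothing on the hot path.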
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgba_out_min = rgba_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf16 row too short"); @@ -104,34 +114,34 @@ pub fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usiz cfg_select! { target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::neon::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgba_row(rgb_in, rgba_out, width); + scalar::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } /// Converts packed `R, G, B` `half::f16` input to packed `R, G, B` `u16` @@ -139,7 +149,7 @@ pub fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usiz /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_u16_row( +pub fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -154,34 +164,34 @@ pub fn rgbf16_to_rgb_u16_row( cfg_select! 
{ target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `half::f16` input to packed `R, G, B, A` `u16` @@ -189,7 +199,7 @@ pub fn rgbf16_to_rgb_u16_row( /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgba_u16_row( +pub fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -204,34 +214,34 @@ pub fn rgbf16_to_rgba_u16_row( cfg_select! { target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::neon::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); + scalar::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } /// **Lossless** half-float pass-through: copies packed `R, G, B` `half::f16` @@ -241,7 +251,7 @@ pub fn rgbf16_to_rgba_u16_row( /// `use_simd = false` forces the scalar reference path (which is also just /// `copy_from_slice` — the compiler will vectorize it regardless). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_f16_row( +pub fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, @@ -256,34 +266,34 @@ pub fn rgbf16_to_rgb_f16_row( cfg_select! 
{ target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } /// Lossless widening pass: converts packed `R, G, B` `half::f16` input to @@ -292,7 +302,7 @@ pub fn rgbf16_to_rgb_f16_row( /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_f32_row( +pub fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -307,32 +317,32 @@ pub fn rgbf16_to_rgb_f32_row( cfg_select! { target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } diff --git a/src/row/dispatch/rgb_float_ops.rs b/src/row/dispatch/rgb_float_ops.rs index 43baaf4c..5a238b91 100644 --- a/src/row/dispatch/rgb_float_ops.rs +++ b/src/row/dispatch/rgb_float_ops.rs @@ -30,7 +30,12 @@ use crate::row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, s /// /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgb_out_min = rgb_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -41,38 +46,38 @@ pub fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize, use_s target_arch = "aarch64" => { if neon_available() { // SAFETY: `neon_available()` verified NEON is present. - unsafe { arch::neon::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F verified. - unsafe { arch::x86_avx512::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgb_row(rgb_in, rgb_out, width); + scalar::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u8` @@ -80,7 +85,12 @@ pub fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize, use_s /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgba_out_min = rgba_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -90,34 +100,34 @@ pub fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize, use cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::neon::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgba_row(rgb_in, rgba_out, width); + scalar::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u16` @@ -125,7 +135,12 @@ pub fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize, use /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgb_out_min = rgb_row_elems(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -135,34 +150,34 @@ pub fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize, cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); + scalar::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u16` @@ -170,7 +185,12 @@ pub fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize, /// /// `use_simd = false` forces the scalar reference path. 
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn rgbf32_to_rgba_u16_row<const BE: bool>(
+    rgb_in: &[f32],
+    rgba_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let rgb_in_min = rgb_row_elems(width);
     let rgba_out_min = rgba_row_elems(width);
     assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short");
@@ -180,34 +200,34 @@ pub fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); }
+                    unsafe { arch::neon::rgbf32_to_rgba_u16_row::<BE>(rgb_in, rgba_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); }
+                    unsafe { arch::x86_avx512::rgbf32_to_rgba_u16_row::<BE>(rgb_in, rgba_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); }
+                    unsafe { arch::x86_avx2::rgbf32_to_rgba_u16_row::<BE>(rgb_in, rgba_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); }
+                    unsafe { arch::x86_sse41::rgbf32_to_rgba_u16_row::<BE>(rgb_in, rgba_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); }
+                    unsafe { arch::wasm_simd128::rgbf32_to_rgba_u16_row::<BE>(rgb_in, rgba_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width);
+    scalar::rgbf32_to_rgba_u16_row::<BE>(rgb_in, rgba_out, width);
 }
 
 /// **Lossless** float pass-through: copies packed `R, G, B` `f32`
@@ -216,7 +236,12 @@ pub fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize
 ///
 /// `use_simd = false` forces the scalar reference path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize, use_simd: bool) {
+pub fn rgbf32_to_rgb_f32_row<const BE: bool>(
+    rgb_in: &[f32],
+    rgb_out: &mut [f32],
+    width: usize,
+    use_simd: bool,
+) {
     let rgb_in_min = rgb_row_elems(width);
     let rgb_out_min = rgb_row_elems(width);
     assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short");
@@ -226,32 +251,32 @@ pub fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize,
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); }
+                    unsafe { arch::neon::rgbf32_to_rgb_f32_row::<BE>(rgb_in, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); }
+                    unsafe { arch::x86_avx512::rgbf32_to_rgb_f32_row::<BE>(rgb_in, rgb_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); }
+                    unsafe { arch::x86_avx2::rgbf32_to_rgb_f32_row::<BE>(rgb_in, rgb_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); }
+                    unsafe { arch::x86_sse41::rgbf32_to_rgb_f32_row::<BE>(rgb_in, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); }
+                    unsafe { arch::wasm_simd128::rgbf32_to_rgb_f32_row::<BE>(rgb_in, rgb_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width);
+    scalar::rgbf32_to_rgb_f32_row::<BE>(rgb_in, rgb_out, width);
 }
diff --git a/src/row/scalar/packed_rgb_float.rs b/src/row/scalar/packed_rgb_float.rs
index 58281e2c..79cd2c4e 100644
--- a/src/row/scalar/packed_rgb_float.rs
+++ b/src/row/scalar/packed_rgb_float.rs
@@ -66,42 +66,86 @@ pub(crate) fn f32_to_u16_clamped(v: f32) -> u16 {
     round_ties_even_nonneg(scaled) as u16
 }
 
+/// Read one f32 element from `rgb_in[i]`, decoding the IEEE 754 bit
+/// pattern from `BE` byte order to host-native byte order. Scalar
+/// endian-aware load for Rgbf32 streams.
+///
+/// `from_be` / `from_le` are target-endian aware: a no-op when the
+/// stored byte order matches the host, a byte-swap when they differ.
+/// Mirrors the SIMD `load_endian_*::<BE>` helpers' semantics so LE and
+/// BE hosts produce identical decoded values.
+#[cfg_attr(not(tarpaulin), inline(always))]
+fn load_f32<const BE: bool>(rgb_in: &[f32], i: usize) -> f32 {
+    let bits = rgb_in[i].to_bits();
+    f32::from_bits(if BE {
+        u32::from_be(bits)
+    } else {
+        u32::from_le(bits)
+    })
+}
+
+/// Read one `half::f16` element from `rgb_in[i]`, decoding the bit
+/// pattern from `BE` byte order to host-native. Scalar endian-aware
+/// load for Rgbf16 streams.
+#[cfg_attr(not(tarpaulin), inline(always))]
+fn load_f16<const BE: bool>(rgb_in: &[half::f16], i: usize) -> half::f16 {
+    let bits = rgb_in[i].to_bits();
+    half::f16::from_bits(if BE {
+        u16::from_be(bits)
+    } else {
+        u16::from_le(bits)
+    })
+}
+
 /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u8`
 /// output. Each `f32` is clamped to `[0, 1]` and scaled by 255.
 ///
+/// `BE` selects the **encoded byte order** of the input buffer:
+/// `false` = LE-encoded on disk/wire, `true` = BE-encoded. This is
+/// independent of the host CPU's native byte order — a swap happens
+/// only when the encoded order differs from the host CPU's native order
+/// (handled internally via `u32::from_le` / `u32::from_be`, both
+/// target-endian-aware).
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn rgbf32_to_rgb_row<const BE: bool>(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 3;
-        rgb_out[i] = f32_to_u8_clamped(rgb_in[i]);
-        rgb_out[i + 1] = f32_to_u8_clamped(rgb_in[i + 1]);
-        rgb_out[i + 2] = f32_to_u8_clamped(rgb_in[i + 2]);
+        rgb_out[i] = f32_to_u8_clamped(load_f32::<BE>(rgb_in, i));
+        rgb_out[i + 1] = f32_to_u8_clamped(load_f32::<BE>(rgb_in, i + 1));
+        rgb_out[i + 2] = f32_to_u8_clamped(load_f32::<BE>(rgb_in, i + 2));
     }
 }
 
 /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u8`
 /// output with `A = 0xFF` (the float source has no alpha).
 ///
+/// When `BE = true` the input `f32` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn rgbf32_to_rgba_row<const BE: bool>(
+    rgb_in: &[f32],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let s = x * 3;
         let d = x * 4;
-        rgba_out[d] = f32_to_u8_clamped(rgb_in[s]);
-        rgba_out[d + 1] = f32_to_u8_clamped(rgb_in[s + 1]);
-        rgba_out[d + 2] = f32_to_u8_clamped(rgb_in[s + 2]);
+        rgba_out[d] = f32_to_u8_clamped(load_f32::<BE>(rgb_in, s));
+        rgba_out[d + 1] = f32_to_u8_clamped(load_f32::<BE>(rgb_in, s + 1));
+        rgba_out[d + 2] = f32_to_u8_clamped(load_f32::<BE>(rgb_in, s + 2));
         rgba_out[d + 3] = 0xFF;
     }
 }
@@ -109,39 +153,51 @@ pub(crate) fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usi
 /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u16`
 /// output. Each `f32` is clamped to `[0, 1]` and scaled by 65535.
 ///
+/// When `BE = true` the input `f32` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) {
+pub(crate) fn rgbf32_to_rgb_u16_row<const BE: bool>(
+    rgb_in: &[f32],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 3;
-        rgb_out[i] = f32_to_u16_clamped(rgb_in[i]);
-        rgb_out[i + 1] = f32_to_u16_clamped(rgb_in[i + 1]);
-        rgb_out[i + 2] = f32_to_u16_clamped(rgb_in[i + 2]);
+        rgb_out[i] = f32_to_u16_clamped(load_f32::<BE>(rgb_in, i));
+        rgb_out[i + 1] = f32_to_u16_clamped(load_f32::<BE>(rgb_in, i + 1));
+        rgb_out[i + 2] = f32_to_u16_clamped(load_f32::<BE>(rgb_in, i + 2));
     }
 }
 
 /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u16`
 /// output with `A = 0xFFFF`.
 ///
+/// When `BE = true` the input `f32` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) {
+pub(crate) fn rgbf32_to_rgba_u16_row<const BE: bool>(
+    rgb_in: &[f32],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let s = x * 3;
         let d = x * 4;
-        rgba_out[d] = f32_to_u16_clamped(rgb_in[s]);
-        rgba_out[d + 1] = f32_to_u16_clamped(rgb_in[s + 1]);
-        rgba_out[d + 2] = f32_to_u16_clamped(rgb_in[s + 2]);
+        rgba_out[d] = f32_to_u16_clamped(load_f32::<BE>(rgb_in, s));
+        rgba_out[d + 1] = f32_to_u16_clamped(load_f32::<BE>(rgb_in, s + 1));
+        rgba_out[d + 2] = f32_to_u16_clamped(load_f32::<BE>(rgb_in, s + 2));
         rgba_out[d + 3] = 0xFFFF;
     }
 }
@@ -150,15 +206,45 @@ pub(crate) fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width
 /// row into the output buffer without conversion. Source HDR values
 /// (> 1.0) and negatives are preserved bit-exact.
 ///
+/// When `BE = true` the input is byte-swapped (big-endian → host-native)
+/// so the output is always host-native `f32`.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) {
+pub(crate) fn rgbf32_to_rgb_f32_row<const BE: bool>(
+    rgb_in: &[f32],
+    rgb_out: &mut [f32],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
-    rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]);
+    // Fast path: encoded byte order matches host-native — pure memcpy.
+    // (LE-encoded data on LE host, or BE-encoded data on BE host.)
+    // The const-generic `BE == HOST_NATIVE_BE` branch is dead-code-
+    // eliminated per monomorphization, so this becomes a single
+    // `copy_from_slice` call with no swap loop.
+    const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+    if BE == HOST_NATIVE_BE {
+        rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]);
+        return;
+    }
+    // Slow path: encoded byte order differs from host — byte-swap each
+    // f32 element via `u32::from_be` / `u32::from_le` (the dead branch
+    // is eliminated since `BE` is const). Output is always host-native.
+    for (dst, src) in rgb_out[..width * 3]
+        .iter_mut()
+        .zip(rgb_in[..width * 3].iter())
+    {
+        let bits = src.to_bits();
+        *dst = f32::from_bits(if BE {
+            u32::from_be(bits)
+        } else {
+            u32::from_le(bits)
+        });
+    }
 }
 
 // ---- Tier 9 — Rgbf16 scalar row kernels --------------------------------
@@ -173,19 +259,25 @@ pub(crate) fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width:
 /// `R, G, B` `u8` output. Each `half::f16` is widened to `f32`, then
 /// clamped to `[0, 1]` and scaled by 255.
 ///
+/// When `BE = true` the input `half::f16` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn rgbf16_to_rgb_row<const BE: bool>(
+    rgb_in: &[half::f16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 3;
-        rgb_out[i] = f32_to_u8_clamped(rgb_in[i].to_f32());
-        rgb_out[i + 1] = f32_to_u8_clamped(rgb_in[i + 1].to_f32());
-        rgb_out[i + 2] = f32_to_u8_clamped(rgb_in[i + 2].to_f32());
+        rgb_out[i] = f32_to_u8_clamped(load_f16::<BE>(rgb_in, i).to_f32());
+        rgb_out[i + 1] = f32_to_u8_clamped(load_f16::<BE>(rgb_in, i + 1).to_f32());
+        rgb_out[i + 2] = f32_to_u8_clamped(load_f16::<BE>(rgb_in, i + 2).to_f32());
     }
 }
 
@@ -193,20 +285,26 @@ pub(crate) fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width:
 /// `R, G, B, A` `u8` output with `A = 0xFF` (the half-float source has no
 /// alpha channel).
 ///
+/// When `BE = true` the input `half::f16` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn rgbf16_to_rgba_row<const BE: bool>(
+    rgb_in: &[half::f16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let s = x * 3;
         let d = x * 4;
-        rgba_out[d] = f32_to_u8_clamped(rgb_in[s].to_f32());
-        rgba_out[d + 1] = f32_to_u8_clamped(rgb_in[s + 1].to_f32());
-        rgba_out[d + 2] = f32_to_u8_clamped(rgb_in[s + 2].to_f32());
+        rgba_out[d] = f32_to_u8_clamped(load_f16::<BE>(rgb_in, s).to_f32());
+        rgba_out[d + 1] = f32_to_u8_clamped(load_f16::<BE>(rgb_in, s + 1).to_f32());
+        rgba_out[d + 2] = f32_to_u8_clamped(load_f16::<BE>(rgb_in, s + 2).to_f32());
         rgba_out[d + 3] = 0xFF;
     }
 }
@@ -215,39 +313,51 @@ pub(crate) fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], widt
 /// `R, G, B` `u16` output. Each `half::f16` is widened to `f32`, then
 /// clamped to `[0, 1]` and scaled by 65535.
 ///
+/// When `BE = true` the input `half::f16` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf16_to_rgb_u16_row(rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize) {
+pub(crate) fn rgbf16_to_rgb_u16_row<const BE: bool>(
+    rgb_in: &[half::f16],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 3;
-        rgb_out[i] = f32_to_u16_clamped(rgb_in[i].to_f32());
-        rgb_out[i + 1] = f32_to_u16_clamped(rgb_in[i + 1].to_f32());
-        rgb_out[i + 2] = f32_to_u16_clamped(rgb_in[i + 2].to_f32());
+        rgb_out[i] = f32_to_u16_clamped(load_f16::<BE>(rgb_in, i).to_f32());
+        rgb_out[i + 1] = f32_to_u16_clamped(load_f16::<BE>(rgb_in, i + 1).to_f32());
+        rgb_out[i + 2] = f32_to_u16_clamped(load_f16::<BE>(rgb_in, i + 2).to_f32());
     }
 }
 
 /// Converts packed `R, G, B` 16-bit half-precision float input to packed
 /// `R, G, B, A` `u16` output with `A = 0xFFFF`.
 ///
+/// When `BE = true` the input `half::f16` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf16_to_rgba_u16_row(rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize) {
+pub(crate) fn rgbf16_to_rgba_u16_row<const BE: bool>(
+    rgb_in: &[half::f16],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let s = x * 3;
         let d = x * 4;
-        rgba_out[d] = f32_to_u16_clamped(rgb_in[s].to_f32());
-        rgba_out[d + 1] = f32_to_u16_clamped(rgb_in[s + 1].to_f32());
-        rgba_out[d + 2] = f32_to_u16_clamped(rgb_in[s + 2].to_f32());
+        rgba_out[d] = f32_to_u16_clamped(load_f16::<BE>(rgb_in, s).to_f32());
+        rgba_out[d + 1] = f32_to_u16_clamped(load_f16::<BE>(rgb_in, s + 1).to_f32());
+        rgba_out[d + 2] = f32_to_u16_clamped(load_f16::<BE>(rgb_in, s + 2).to_f32());
        rgba_out[d + 3] = 0xFFFF;
     }
 }
@@ -256,16 +366,31 @@ pub(crate) fn rgbf16_to_rgba_u16_row(rgb_in: &[half::f16], rgba_out: &mut [u16],
 /// (> 1.0) and negatives bit-exactly through the widen step. Output
 /// is `f32`; no clamping is applied.
 ///
+/// When `BE = true` the input `half::f16` values are big-endian encoded.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf16_to_rgb_f32_row(rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize) {
+pub(crate) fn rgbf16_to_rgb_f32_row<const BE: bool>(
+    rgb_in: &[half::f16],
+    rgb_out: &mut [f32],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
-    for i in 0..width * 3 {
-        rgb_out[i] = rgb_in[i].to_f32();
+    for (dst, src) in rgb_out[..width * 3]
+        .iter_mut()
+        .zip(rgb_in[..width * 3].iter())
+    {
+        let bits = src.to_bits();
+        let host_bits = if BE {
+            u16::from_be(bits)
+        } else {
+            u16::from_le(bits)
+        };
+        *dst = half::f16::from_bits(host_bits).to_f32();
     }
 }
 
@@ -273,13 +398,42 @@ pub(crate) fn rgbf16_to_rgb_f32_row(rgb_in: &[half::f16], rgb_out: &mut [f32], w
 /// into the output buffer without any conversion. Source HDR values and
 /// negatives are preserved bit-exact.
 ///
+/// When `BE = true` the input values are byte-swapped to host-native order
+/// on output.
+///
 /// # Panics
 ///
 /// Panics (any build profile) if `rgb_in.len() < 3 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgbf16_to_rgb_f16_row(rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize) {
+pub(crate) fn rgbf16_to_rgb_f16_row<const BE: bool>(
+    rgb_in: &[half::f16],
+    rgb_out: &mut [half::f16],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short");
-    rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]);
+    // Fast path: encoded byte order matches host-native — pure memcpy.
+    // Mirrors the `rgbf32_to_rgb_f32_row` fast path; the const-generic
+    // `BE == HOST_NATIVE_BE` branch is dead-code-eliminated per
+    // monomorphization, so this becomes a single `copy_from_slice`.
+    const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+    if BE == HOST_NATIVE_BE {
+        rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]);
+        return;
+    }
+    // Slow path: encoded byte order differs from host — byte-swap each
+    // f16 element via `u16::from_be` / `u16::from_le`. Output is always
+    // host-native f16.
+    for (dst, src) in rgb_out[..width * 3]
+        .iter_mut()
+        .zip(rgb_in[..width * 3].iter())
+    {
+        let bits = src.to_bits();
+        *dst = half::f16::from_bits(if BE {
+            u16::from_be(bits)
+        } else {
+            u16::from_le(bits)
+        });
+    }
 }
diff --git a/src/row/scalar/tests.rs b/src/row/scalar/tests.rs
index 61a2857b..2bf38a79 100644
--- a/src/row/scalar/tests.rs
+++ b/src/row/scalar/tests.rs
@@ -641,10 +641,45 @@ fn p416_rgba_u16_gray_alpha_is_ffff() {
 // values, calls the `rgbf16_to_*_row` kernel, and then calls the matching
 // `rgbf32_to_*_row` kernel with the widened f32 slice. The outputs must
 // be identical, proving that widening is the only difference.
+//
+// LE-host gating rationale (codex 5th-pass review of PR #83):
+//
+// The fixture builder `rgbf16_test_inputs` produces host-native `half::f16`
+// (and widened host-native `f32`) values via `half::f16::from_f32` /
+// `to_f32`. The tests then call the kernels with `::<false>`, which means
+// "input is LE-encoded — decode to host-native by applying `from_le`".
+//
+// On a little-endian host, host-native bits and LE-encoded bits are the
+// same byte sequence, so `u16::from_le` / `u32::from_le` is a no-op and
+// the assertion holds.
+//
+// On a big-endian host, host-native `f16`/`f32` bits do NOT lay out
+// little-endian, so the kernel's `from_le` byte-swap correctly
+// reinterprets the host-native fixture as if it were an LE-encoded
+// payload — producing a different (corrupted) value than the test
+// expects. The kernel itself is correct; this is purely a
+// fixture-vs-kernel byte-order mismatch on BE hosts (same class as the
+// PR #82 alpha_extract / planar_gbr_high_bit gates in `8f2e329`).
+//
+// Kernel BE-host correctness is locked down separately by the dedicated
+// BE-parity tests in the per-backend `tests/packed_rgb_float.rs`
+// modules, which build LE-encoded fixtures via
+// `f32::from_bits(u32::from_le(_))` / `half::f16::from_bits(u16::from_le(_))`
+// and assert the kernel output matches the original host-native values
+// on every host. Those tests are intentionally NOT gated.
+//
+// The fixture set `[0.0, 1.0, 0.5, 65504.0, 1e-5, -0.5, 2.5, 0.999, 0.001]`
+// includes only one byte-symmetric value (`0.0` → `0x00..00`); every
+// other value has distinct LE/BE byte layouts, so the parity assertions
+// would fail on BE without gating.
 
 /// 9 representative half-float inputs: normal [0,1] range, HDR, subnormal-
 /// ish, negative, and over-range. Replicated across 9 pixels × 3 channels
 /// so that every channel position sees every value at some pixel.
+///
+/// LE-only: the six parity / widen / copy tests below are all gated on
+/// `target_endian = "little"`, so this helper is unused on BE hosts.
+#[cfg(target_endian = "little")]
 fn rgbf16_test_inputs() -> (Vec<half::f16>, Vec<f32>, usize) {
     let inputs_f32: [f32; 9] = [0.0, 1.0, 0.5, 65504.0, 1e-5, -0.5, 2.5, 0.999, 0.001];
     let width = inputs_f32.len();
@@ -656,6 +691,7 @@ fn rgbf16_test_inputs() -> (Vec<half::f16>, Vec<f32>, usize) {
 }
 
 #[test]
+#[cfg(target_endian = "little")]
 #[cfg_attr(
     miri,
     ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri"
@@ -664,12 +700,13 @@ fn rgbf16_scalar_rgb_matches_widen_then_rgbf32() {
     let (rgb_in, widened, width) = rgbf16_test_inputs();
     let mut out_f16 = std::vec![0u8; width * 3];
     let mut out_via_f32 = std::vec![0u8; width * 3];
-    rgbf16_to_rgb_row(&rgb_in, &mut out_f16, width);
-    rgbf32_to_rgb_row(&widened, &mut out_via_f32, width);
+    rgbf16_to_rgb_row::<false>(&rgb_in, &mut out_f16, width);
+    rgbf32_to_rgb_row::<false>(&widened, &mut out_via_f32, width);
     assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgb scalar parity");
 }
 
 #[test]
+#[cfg(target_endian = "little")]
 #[cfg_attr(
     miri,
     ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri"
@@ -678,12 +715,13 @@ fn rgbf16_scalar_rgba_matches_widen_then_rgbf32() {
     let (rgb_in, widened, width) = rgbf16_test_inputs();
     let mut out_f16 = std::vec![0u8; width * 4];
     let mut out_via_f32 = std::vec![0u8; width * 4];
-    rgbf16_to_rgba_row(&rgb_in, &mut out_f16, width);
-    rgbf32_to_rgba_row(&widened, &mut out_via_f32, width);
+    rgbf16_to_rgba_row::<false>(&rgb_in, &mut out_f16, width);
+    rgbf32_to_rgba_row::<false>(&widened, &mut out_via_f32, width);
     assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgba scalar parity");
 }
 
 #[test]
+#[cfg(target_endian = "little")]
 #[cfg_attr(
     miri,
     ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri"
@@ -692,12 +730,13 @@ fn rgbf16_scalar_rgb_u16_matches_widen_then_rgbf32() {
     let (rgb_in, widened, width) = rgbf16_test_inputs();
     let mut out_f16 = std::vec![0u16; width * 3];
     let mut out_via_f32 = std::vec![0u16; width * 3];
-    rgbf16_to_rgb_u16_row(&rgb_in, &mut out_f16, width);
-    rgbf32_to_rgb_u16_row(&widened, &mut out_via_f32, width);
+    rgbf16_to_rgb_u16_row::<false>(&rgb_in, &mut out_f16, width);
+    rgbf32_to_rgb_u16_row::<false>(&widened, &mut out_via_f32, width);
     assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgb_u16 scalar parity");
 }
 
 #[test]
+#[cfg(target_endian = "little")]
 #[cfg_attr(
     miri,
     ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri"
@@ -706,12 +745,13 @@ fn rgbf16_scalar_rgba_u16_matches_widen_then_rgbf32() {
     let (rgb_in, widened, width) = rgbf16_test_inputs();
     let mut out_f16 = std::vec![0u16; width * 4];
     let mut out_via_f32 = std::vec![0u16; width * 4];
-    rgbf16_to_rgba_u16_row(&rgb_in, &mut out_f16, width);
-    rgbf32_to_rgba_u16_row(&widened, &mut out_via_f32, width);
+    rgbf16_to_rgba_u16_row::<false>(&rgb_in, &mut out_f16, width);
+    rgbf32_to_rgba_u16_row::<false>(&widened, &mut out_via_f32, width);
     assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgba_u16 scalar parity");
 }
 
 #[test]
+#[cfg(target_endian = "little")]
 #[cfg_attr(
     miri,
     ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri"
@@ -719,7 +759,7 @@ fn rgbf16_scalar_rgba_u16_matches_widen_then_rgbf32() {
 fn rgbf16_scalar_rgb_f32_matches_element_wise_widen() {
     let (rgb_in, widened, width) = rgbf16_test_inputs();
     let mut out = std::vec![0.0f32; width * 3];
-    rgbf16_to_rgb_f32_row(&rgb_in, &mut out, width);
+    rgbf16_to_rgb_f32_row::<false>(&rgb_in, &mut out, width);
     // Each output must equal the bit-exact widening of the input f16.
     assert_eq!(
         out, widened,
@@ -728,6 +768,7 @@ fn rgbf16_scalar_rgb_f32_matches_element_wise_widen() {
 }
 
 #[test]
+#[cfg(target_endian = "little")]
 #[cfg_attr(
     miri,
     ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri"
@@ -735,7 +776,7 @@ fn rgbf16_scalar_rgb_f32_matches_element_wise_widen() {
 fn rgbf16_scalar_rgb_f16_is_copy() {
     let (rgb_in, _widened, width) = rgbf16_test_inputs();
     let mut out = std::vec![half::f16::ZERO; width * 3];
-    rgbf16_to_rgb_f16_row(&rgb_in, &mut out, width);
+    rgbf16_to_rgb_f16_row::<false>(&rgb_in, &mut out, width);
     assert_eq!(
         out, rgb_in,
         "rgbf16_to_rgb_f16 must be a byte-identical copy"
diff --git a/src/sinker/mixed/packed_rgb_f16.rs b/src/sinker/mixed/packed_rgb_f16.rs
index 38e07d15..62ab1cc8 100644
--- a/src/sinker/mixed/packed_rgb_f16.rs
+++ b/src/sinker/mixed/packed_rgb_f16.rs
@@ -34,6 +34,25 @@ use crate::{
     yuv::{Rgbf16, Rgbf16Row, Rgbf16Sink},
 };
 
+/// `BE` value that makes the `rgbf16_to_*` row dispatchers treat their input as
+/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf16Frame`]
+/// exposes a `&[half::f16]` row in **host-native** layout — the API contract is that the
+/// caller hands us already-decoded half-floats. The kernel `BE` parameter,
+/// however, names the **encoded** byte order (so `BE = false` means "decode
+/// LE-encoded bytes" via `u16::from_le`). On a LE host the host-native layout
+/// is LE, so `BE = false` is correct; on a BE host the host-native layout is
+/// BE, so we must request `BE = true` to make `u16::from_be` no-op the swap.
+/// Without this routing the loaders would byte-swap an already-decoded host-
+/// native `f16` on BE hosts, corrupting every output path.
+///
+/// This is the **sinker-layer** complement to the SIMD-backend-internal
+/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in
+/// `c3a6478` — same truth table, different layer:
+///
+/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct.
+/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct.
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+
 // ---- Rgbf16 impl -------------------------------------------------------
 
 impl<'a> MixedSinker<'a, Rgbf16> {
@@ -234,27 +253,27 @@ impl PixelSink for MixedSinker<'_, Rgbf16> {
         if let Some(buf) = rgb_f16.as_deref_mut() {
             let f16_start = one_plane_start * 3;
             let f16_end = one_plane_end * 3;
-            rgbf16_to_rgb_f16_row(rgb_in, &mut buf[f16_start..f16_end], w, use_simd);
+            rgbf16_to_rgb_f16_row::<HOST_NATIVE_BE>(rgb_in, &mut buf[f16_start..f16_end], w, use_simd);
         }
 
         // Lossless f32 widen — also independent of integer conversion paths.
         if let Some(buf) = rgb_f32.as_deref_mut() {
             let f32_start = one_plane_start * 3;
             let f32_end = one_plane_end * 3;
-            rgbf16_to_rgb_f32_row(rgb_in, &mut buf[f32_start..f32_end], w, use_simd);
+            rgbf16_to_rgb_f32_row::<HOST_NATIVE_BE>(rgb_in, &mut buf[f32_start..f32_end], w, use_simd);
         }
 
         // u16 RGB output — direct half-float → u16 conversion (no staging).
         if let Some(buf) = rgb_u16.as_deref_mut() {
             let u16_start = one_plane_start * 3;
             let u16_end = one_plane_end * 3;
-            rgbf16_to_rgb_u16_row(rgb_in, &mut buf[u16_start..u16_end], w, use_simd);
+            rgbf16_to_rgb_u16_row::<HOST_NATIVE_BE>(rgb_in, &mut buf[u16_start..u16_end], w, use_simd);
         }
 
         // u16 RGBA output — direct half-float → u16 conversion (no staging).
         if let Some(buf) = rgba_u16.as_deref_mut() {
             let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-            rgbf16_to_rgba_u16_row(rgb_in, rgba_row, w, use_simd);
+            rgbf16_to_rgba_u16_row::<HOST_NATIVE_BE>(rgb_in, rgba_row, w, use_simd);
         }
 
         // u8 RGBA standalone fast path — direct float → u8 when no RGB / luma /
@@ -269,7 +288,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> {
         if want_rgba_u8 && !need_u8_rgb {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            rgbf16_to_rgba_row(rgb_in, rgba_row, w, use_simd);
+            rgbf16_to_rgba_row::<HOST_NATIVE_BE>(rgb_in, rgba_row, w, use_simd);
             return Ok(());
         }
 
@@ -288,7 +307,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> {
                 w,
                 h,
             )?;
-            rgbf16_to_rgb_row(rgb_in, rgb_row, w, use_simd);
+            rgbf16_to_rgb_row::<HOST_NATIVE_BE>(rgb_in, rgb_row, w, use_simd);
 
             if let Some(luma) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -328,7 +347,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> {
         // over `rgb_row` via `expand_rgb_to_rgba_row`.
         if let Some(buf) = rgba.as_deref_mut() {
             let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-            rgbf16_to_rgba_row(rgb_in, rgba_row, w, use_simd);
+            rgbf16_to_rgba_row::<HOST_NATIVE_BE>(rgb_in, rgba_row, w, use_simd);
         }
 
         Ok(())
diff --git a/src/sinker/mixed/packed_rgb_float.rs b/src/sinker/mixed/packed_rgb_float.rs
index cc63c9b2..f189e5ab 100644
--- a/src/sinker/mixed/packed_rgb_float.rs
+++ b/src/sinker/mixed/packed_rgb_float.rs
@@ -31,6 +31,25 @@ use crate::{
     yuv::{Rgbf32, Rgbf32Row, Rgbf32Sink},
 };
 
+/// `BE` value that makes the `rgbf32_to_*` row dispatchers treat their input as
+/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf32Frame`]
+/// exposes a `&[f32]` row in **host-native** layout — the API contract is that the caller
+/// hands us already-decoded floats. The kernel `BE` parameter, however, names
+/// the **encoded** byte order (so `BE = false` means "decode LE-encoded bytes"
+/// via `u32::from_le`). On a LE host the host-native layout is LE, so
+/// `BE = false` is correct; on a BE host the host-native layout is BE, so we
+/// must request `BE = true` to make `u32::from_be` no-op the swap. Without this
+/// routing the loaders would byte-swap an already-decoded host-native `f32` on
+/// BE hosts, corrupting every output path.
+///
+/// This is the **sinker-layer** complement to the SIMD-backend-internal
+/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in
+/// `c3a6478` — same truth table, different layer:
+///
+/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct.
+/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct.
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+
 // ---- Rgbf32 impl -------------------------------------------------------
 
 impl<'a> MixedSinker<'a, Rgbf32> {
@@ -209,20 +228,20 @@ impl PixelSink for MixedSinker<'_, Rgbf32> {
         if let Some(buf) = rgb_f32.as_deref_mut() {
             let f32_start = one_plane_start * 3;
             let f32_end = one_plane_end * 3;
-            rgbf32_to_rgb_f32_row(rgb_in, &mut buf[f32_start..f32_end], w, use_simd);
+            rgbf32_to_rgb_f32_row::<HOST_NATIVE_BE>(rgb_in, &mut buf[f32_start..f32_end], w, use_simd);
         }
 
         // u16 RGB output — direct float→u16 conversion (no staging).
         if let Some(buf) = rgb_u16.as_deref_mut() {
             let u16_start = one_plane_start * 3;
             let u16_end = one_plane_end * 3;
-            rgbf32_to_rgb_u16_row(rgb_in, &mut buf[u16_start..u16_end], w, use_simd);
+            rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>(rgb_in, &mut buf[u16_start..u16_end], w, use_simd);
         }
 
         // u16 RGBA output — direct float→u16 conversion (no staging).
         if let Some(buf) = rgba_u16.as_deref_mut() {
             let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-            rgbf32_to_rgba_u16_row(rgb_in, rgba_row, w, use_simd);
+            rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>(rgb_in, rgba_row, w, use_simd);
         }
 
         // u8 RGBA standalone fast path — direct float→u8 conversion when
@@ -237,7 +256,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> {
         if want_rgba_u8 && !need_u8_rgb {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            rgbf32_to_rgba_row(rgb_in, rgba_row, w, use_simd);
+            rgbf32_to_rgba_row::<HOST_NATIVE_BE>(rgb_in, rgba_row, w, use_simd);
             return Ok(());
         }
 
@@ -257,7 +276,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> {
                 w,
                 h,
             )?;
-            rgbf32_to_rgb_row(rgb_in, rgb_row, w, use_simd);
+            rgbf32_to_rgb_row::<HOST_NATIVE_BE>(rgb_in, rgb_row, w, use_simd);
 
             if let Some(luma) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -299,7 +318,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> {
         // less memory pass for combined `with_rgb + with_rgba` callers.
         if let Some(buf) = rgba.as_deref_mut() {
             let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-            rgbf32_to_rgba_row(rgb_in, rgba_row, w, use_simd);
+            rgbf32_to_rgba_row::<HOST_NATIVE_BE>(rgb_in, rgba_row, w, use_simd);
         }
 
         Ok(())
diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs
index a30bbe8a..1cd09a1f 100644
--- a/src/sinker/mixed/tests/packed_rgb_f16.rs
+++ b/src/sinker/mixed/tests/packed_rgb_f16.rs
@@ -310,3 +310,132 @@ fn rgbf16_simd_matches_scalar_with_random_input() {
     assert_eq!(luma_u16_simd, luma_u16_scalar, "Luma u16 output diverges");
     assert_eq!(rgb_f16_simd, pix, "RGB f16 output is not lossless");
 }
+
+/// Sinker-layer host-native-`f16` regression for the bug fixed alongside
+/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf16`] sinker used to
+/// hardcode `::<false>` when calling the row dispatchers, telling them to
+/// "decode LE-encoded input". Because [`Rgbf16Frame`] hands us a host-native
+/// `&[half::f16]` row, that routing was a no-op on LE hosts but corrupted
+/// every output path on BE hosts (the `u16` loaders would byte-swap an
+/// already-decoded f16 bit-pattern). The fix replaces those `::<false>` with
+/// `::<HOST_NATIVE_BE>`, which is `false` on LE and `true` on BE — a no-op
+/// byte-swap on either host.
+///
+/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run),
+/// `HOST_NATIVE_BE = false` and `::<HOST_NATIVE_BE>` is byte-for-byte
+/// identical to `::<false>`, so this test cannot distinguish the broken vs
+/// fixed code on LE. It instead documents the equivalence at the **kernel
+/// dispatch** layer — calling each `rgbf16_to_*` dispatcher with both
+/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`)
+/// must produce identical output on the active host.
+///
+/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the
+/// equality `::<false>` ≡ `::<HOST_NATIVE_BE>` is _false_ — `::<false>`
+/// decodes the host-native fixture as if it were LE-encoded (byte-swap),
+/// while `::<HOST_NATIVE_BE> == ::<true>` decodes as BE (no swap), so the
+/// outputs diverge by design. The dispatch-equivalence claim is specifically
+/// about the LE host-routing pattern; the BE-host correctness of the routing
+/// change is verified instead by
+/// [`rgbf16_sinker_host_native_contract_lossless_passthrough`] and the
+/// row-kernel BE parity tests in `src/row/arch/*/tests/`.
+#[test]
+#[cfg(target_endian = "little")]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn rgbf16_kernel_host_native_be_matches_false_on_le_host() {
+    use crate::row::{
+        rgbf16_to_rgb_f16_row, rgbf16_to_rgb_f32_row, rgbf16_to_rgb_row, rgbf16_to_rgb_u16_row,
+        rgbf16_to_rgba_row, rgbf16_to_rgba_u16_row,
+    };
+
+    // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`.
+    // Compute it locally so the test asserts the same condition without taking
+    // a dependency on a private const.
+    const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+
+    // Width 33 covers SIMD main loop + scalar tail across every backend.
+    let w = 33usize;
+    let f32_inputs = [0.0f32, 0.5, 1.0, 1.75, -0.25];
+    let pix: std::vec::Vec<half::f16> = (0..w * 3)
+        .map(|i| half::f16::from_f32(f32_inputs[i % f32_inputs.len()]))
+        .collect();
+
+    // u8 RGB.
+    let mut rgb_false = std::vec![0u8; w * 3];
+    let mut rgb_host = std::vec![0u8; w * 3];
+    rgbf16_to_rgb_row::<false>(&pix, &mut rgb_false, w, true);
+    rgbf16_to_rgb_row::<HOST_NATIVE_BE>(&pix, &mut rgb_host, w, true);
+    assert_eq!(rgb_false, rgb_host, "u8 RGB diverges");
+
+    // u8 RGBA.
+    let mut rgba_false = std::vec![0u8; w * 4];
+    let mut rgba_host = std::vec![0u8; w * 4];
+    rgbf16_to_rgba_row::<false>(&pix, &mut rgba_false, w, true);
+    rgbf16_to_rgba_row::<HOST_NATIVE_BE>(&pix, &mut rgba_host, w, true);
+    assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges");
+
+    // u16 RGB.
+    let mut rgb_u16_false = std::vec![0u16; w * 3];
+    let mut rgb_u16_host = std::vec![0u16; w * 3];
+    rgbf16_to_rgb_u16_row::<false>(&pix, &mut rgb_u16_false, w, true);
+    rgbf16_to_rgb_u16_row::<HOST_NATIVE_BE>(&pix, &mut rgb_u16_host, w, true);
+    assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges");
+
+    // u16 RGBA.
+    let mut rgba_u16_false = std::vec![0u16; w * 4];
+    let mut rgba_u16_host = std::vec![0u16; w * 4];
+    rgbf16_to_rgba_u16_row::<false>(&pix, &mut rgba_u16_false, w, true);
+    rgbf16_to_rgba_u16_row::<HOST_NATIVE_BE>(&pix, &mut rgba_u16_host, w, true);
+    assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges");
+
+    // f16 lossless pass-through.
+    let mut f16_false = std::vec![half::f16::ZERO; w * 3];
+    let mut f16_host = std::vec![half::f16::ZERO; w * 3];
+    rgbf16_to_rgb_f16_row::<false>(&pix, &mut f16_false, w, true);
+    rgbf16_to_rgb_f16_row::<HOST_NATIVE_BE>(&pix, &mut f16_host, w, true);
+    assert_eq!(f16_false, f16_host, "f16 RGB diverges");
+    if !HOST_NATIVE_BE {
+        assert_eq!(
+            f16_host, pix,
+            "f16 lossless pass-through corrupted on LE host"
+        );
+    }
+
+    // f32 lossless widen.
+    let mut f32_false = std::vec![0.0f32; w * 3];
+    let mut f32_host = std::vec![0.0f32; w * 3];
+    rgbf16_to_rgb_f32_row::<false>(&pix, &mut f32_false, w, true);
+    rgbf16_to_rgb_f32_row::<HOST_NATIVE_BE>(&pix, &mut f32_host, w, true);
+    assert_eq!(f32_false, f32_host, "f32 widen diverges");
+}
+
+/// End-to-end sinker contract test: feeding host-native `half::f16` through
+/// [`MixedSinker`] must round-trip the f16 input bit-exact via
+/// `with_rgb_f16` on every host. Documents the public-API contract that the
+/// [`HOST_NATIVE_BE`] routing fix preserves. Pairs with the kernel-level
+/// test above; together they cover both the dispatch boundary and the public
+/// sinker boundary.
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn rgbf16_sinker_host_native_contract_lossless_passthrough() {
+    let vals_f32 = [0.5f32, 1.5, -0.25, 100.0];
+    let pix: std::vec::Vec<half::f16> = (0..16 * 4 * 3)
+        .map(|i| half::f16::from_f32(vals_f32[i % vals_f32.len()]))
+        .collect();
+    let src = Rgbf16Frame::try_new(&pix, 16, 4, 16 * 3).unwrap();
+
+    let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3];
+    let mut sink = MixedSinker::<Rgbf16>::new(16, 4)
+        .with_rgb_f16(&mut rgb_f16_out)
+        .unwrap();
+    rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    // Bit-exact pass-through on every host — broken `::<false>` routing
+    // would byte-swap on a BE host; the fixed routing keeps the f16 intact.
+    assert_eq!(rgb_f16_out, pix, "Rgbf16 sinker f16 pass-through corrupted");
+}
diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs
index 47cd3be4..fd4df2da 100644
--- a/src/sinker/mixed/tests/packed_rgb_float.rs
+++ b/src/sinker/mixed/tests/packed_rgb_float.rs
@@ -245,3 +245,141 @@ fn rgbf32_simd_matches_scalar_with_random_input() {
     assert_eq!(luma_u16_simd, luma_u16_scalar, "Luma u16 output diverges");
     assert_eq!(rgb_f32_simd, pix, "RGB f32 output is not lossless");
 }
+
+/// Sinker-layer host-native-`f32` regression for the bug fixed alongside
+/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf32`] sinker used to
+/// hardcode `::<false>` when calling the row dispatchers, telling them to
+/// "decode LE-encoded input". Because [`Rgbf32Frame`] hands us a host-native
+/// `&[f32]` row, that routing was a no-op on LE hosts but corrupted every
+/// output path on BE hosts (the loaders would byte-swap an already-decoded
+/// f32). The fix replaces those `::<false>` with `::<HOST_NATIVE_BE>`, which
+/// is `false` on LE and `true` on BE — a no-op byte-swap on either host.
+///
+/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run),
+/// `HOST_NATIVE_BE = false` and `::<HOST_NATIVE_BE>` is byte-for-byte
+/// identical to `::<false>`, so this test cannot distinguish the broken vs
+/// fixed code on LE. It instead documents the equivalence at the **kernel
+/// dispatch** layer — calling each `rgbf32_to_*` dispatcher with both
+/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`)
+/// must produce identical output on the active host.
+///
+/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the
+/// equality `::<false>` ≡ `::<HOST_NATIVE_BE>` is _false_ — `::<false>`
+/// decodes the host-native fixture as if it were LE-encoded (byte-swap),
+/// while `::<HOST_NATIVE_BE> == ::<true>` decodes as BE (no swap), so the
+/// outputs diverge by design. The dispatch-equivalence claim is specifically
+/// about the LE host-routing pattern; the BE-host correctness of the routing
+/// change is verified instead by
+/// [`rgbf32_sinker_host_native_contract_lossless_passthrough`] and the
+/// row-kernel BE parity tests in `src/row/arch/*/tests/`.
+#[test]
+#[cfg(target_endian = "little")]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn rgbf32_kernel_host_native_be_matches_false_on_le_host() {
+    use crate::row::{
+        rgbf32_to_rgb_f32_row, rgbf32_to_rgb_row, rgbf32_to_rgb_u16_row, rgbf32_to_rgba_row,
+        rgbf32_to_rgba_u16_row,
+    };
+
+    // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`.
+    // Compute it locally so the test asserts the same condition without taking
+    // a dependency on a private const.
+    const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+
+    // Width 33 covers SIMD main loop + scalar tail across every backend.
+    let w = 33usize;
+    let mut pix = std::vec![0.0f32; w * 3];
+    for (i, v) in pix.iter_mut().enumerate() {
+        // Mix in-range, HDR, and negative values to exercise every clamp branch.
+        *v = match i % 5 {
+            0 => 0.0,
+            1 => 0.5,
+            2 => 1.0,
+            3 => 1.75,
+            _ => -0.25,
+        };
+    }
+
+    // u8 RGB.
+    let mut rgb_false = std::vec![0u8; w * 3];
+    let mut rgb_host = std::vec![0u8; w * 3];
+    rgbf32_to_rgb_row::<false>(&pix, &mut rgb_false, w, true);
+    rgbf32_to_rgb_row::<HOST_NATIVE_BE>(&pix, &mut rgb_host, w, true);
+    assert_eq!(rgb_false, rgb_host, "u8 RGB diverges");
+
+    // u8 RGBA.
+    let mut rgba_false = std::vec![0u8; w * 4];
+    let mut rgba_host = std::vec![0u8; w * 4];
+    rgbf32_to_rgba_row::<false>(&pix, &mut rgba_false, w, true);
+    rgbf32_to_rgba_row::<HOST_NATIVE_BE>(&pix, &mut rgba_host, w, true);
+    assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges");
+
+    // u16 RGB.
+    let mut rgb_u16_false = std::vec![0u16; w * 3];
+    let mut rgb_u16_host = std::vec![0u16; w * 3];
+    rgbf32_to_rgb_u16_row::<false>(&pix, &mut rgb_u16_false, w, true);
+    rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>(&pix, &mut rgb_u16_host, w, true);
+    assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges");
+
+    // u16 RGBA.
+    let mut rgba_u16_false = std::vec![0u16; w * 4];
+    let mut rgba_u16_host = std::vec![0u16; w * 4];
+    rgbf32_to_rgba_u16_row::<false>(&pix, &mut rgba_u16_false, w, true);
+    rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>(&pix, &mut rgba_u16_host, w, true);
+    assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges");
+
+    // f32 lossless pass-through.
+    let mut f32_false = std::vec![0.0f32; w * 3];
+    let mut f32_host = std::vec![0.0f32; w * 3];
+    rgbf32_to_rgb_f32_row::<false>(&pix, &mut f32_false, w, true);
+    rgbf32_to_rgb_f32_row::<HOST_NATIVE_BE>(&pix, &mut f32_host, w, true);
+    assert_eq!(f32_false, f32_host, "f32 RGB diverges");
+    // And on the host (LE on every CI runner) both must equal `pix` bit-exact.
+    if !HOST_NATIVE_BE {
+        assert_eq!(
+            f32_host, pix,
+            "f32 lossless pass-through corrupted on LE host"
+        );
+    }
+}
+
+/// End-to-end sinker contract test: feeding host-native `f32` through
+/// [`MixedSinker`] must produce the same output every other sinker
+/// would expect from a host-native source — specifically, `with_rgb_f32`
+/// must be bit-exact identical to the input on every host. Documents the
+/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves.
+/// Pairs with the kernel-level test above; together they cover both the
+/// dispatch boundary and the public sinker boundary.
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn rgbf32_sinker_host_native_contract_lossless_passthrough() {
+    // Mix HDR, in-range, and negative values — the f32 lossless path must
+    // round-trip them bit-exact on every host.
+    let mut pix = std::vec![0.0f32; 16 * 4 * 3];
+    for (i, v) in pix.iter_mut().enumerate() {
+        *v = match i % 4 {
+            0 => 0.5,
+            1 => 1.5,
+            2 => -0.25,
+            _ => 100.0,
+        };
+    }
+    let src = Rgbf32Frame::try_new(&pix, 16, 4, 16 * 3).unwrap();
+
+    let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3];
+    let mut sink = MixedSinker::<Rgbf32>::new(16, 4)
+        .with_rgb_f32(&mut rgb_f32_out)
+        .unwrap();
+    rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    // Bit-exact pass-through on every host. On the buggy `::<false>` routing
+    // a BE host would see byte-swapped output here; on the fixed routing the
+    // assertion holds on both LE and BE.
+    assert_eq!(rgb_f32_out, pix, "Rgbf32 sinker f32 pass-through corrupted");
+}
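
Standalone sketch (not part of the diff above): the decode rule that the scalar `load_f32::<BE>` helper and the SIMD `load_endian_*` loaders implement, written against plain byte arrays so it can be compiled and checked on either host. `decode_f32` and `main` are illustrative names, not crate items; only `core`/`std` APIs are used.

// Minimal sketch, assuming nothing beyond the standard library.
// `wire_bytes` holds the f32 exactly as stored on disk/wire; `BE` names
// that stored byte order, mirroring the kernels' const parameter.
fn decode_f32<const BE: bool>(wire_bytes: [u8; 4]) -> f32 {
    // Reinterpret the stored bytes as a host-native u32, then let
    // `from_be` / `from_le` decide whether a swap is needed on this host.
    let bits = u32::from_ne_bytes(wire_bytes);
    f32::from_bits(if BE { u32::from_be(bits) } else { u32::from_le(bits) })
}

fn main() {
    // 1.0f32 has the bit pattern 0x3F80_0000; LE wire order stores the low
    // byte first, BE wire order the high byte first.
    let le_wire = [0x00, 0x00, 0x80, 0x3F];
    let be_wire = [0x3F, 0x80, 0x00, 0x00];
    assert_eq!(decode_f32::<false>(le_wire), 1.0); // LE-encoded payload
    assert_eq!(decode_f32::<true>(be_wire), 1.0); // BE-encoded payload
    println!("both payloads decode to 1.0 on any host");
}

The same asserts pass on a little-endian and a big-endian host, which is the portability property the per-element `from_le` / `from_be` calls in the scalar kernels are relying on.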
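A second standalone sketch of the caller-side routing rule the sinker-layer `HOST_NATIVE_BE` doc comments describe: pick the `BE` const from where the data came from, not from the host. The dispatcher path `crate::row::rgbf32_to_rgb_row` and its `(rgb_in, rgb_out, width, use_simd)` signature are taken from the diff; the two wrapper functions are hypothetical helpers meant to live inside the crate and exist only to show the choice.

// Illustrative only; assumes it sits inside the crate next to the sinkers.
use crate::row::rgbf32_to_rgb_row;

const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

/// Input already decoded by this process (host-native `f32`, e.g. a frame
/// row): request a no-op byte-swap on either host.
fn convert_native_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) {
    rgbf32_to_rgb_row::<HOST_NATIVE_BE>(rgb_in, rgb_out, width, true);
}

/// Input read straight off an LE-encoded file/wire payload: always `false`,
/// which swaps only when the host itself is big-endian.
fn convert_le_wire_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) {
    rgbf32_to_rgb_row::<false>(rgb_in, rgb_out, width, true);
}

Hardcoding `::<false>` in the first case is exactly the bug the sinker tests above guard against: it is harmless on LE hosts and corrupts every output path on BE hosts.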