diff --git a/src/row/arch/neon/alpha_extract.rs b/src/row/arch/neon/alpha_extract.rs index 8b870dc2..ffb04e6a 100644 --- a/src/row/arch/neon/alpha_extract.rs +++ b/src/row/arch/neon/alpha_extract.rs @@ -241,7 +241,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this NEON helper does host-native u16 + // loads (`vld1q_u16`), which match LE-on-disk only on LE hosts. The + // dispatcher routes the BE = true case directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -286,7 +291,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -409,7 +415,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -430,7 +437,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). 
+ scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -448,7 +456,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/neon/planar_gbr_high_bit.rs b/src/row/arch/neon/planar_gbr_high_bit.rs index 44996068..0d839324 100644 --- a/src/row/arch/neon/planar_gbr_high_bit.rs +++ b/src/row/arch/neon/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! NEON kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE: bool` (endianness of the source u16 planes). //! Lane width: 8 pixels per iteration (`vld1q_u16` = 8 × u16). //! `vst3q_u16` / `vst4q_u16` do the 3-way / 4-way u16 interleave in a //! single hardware instruction. Scalar tails handle the remainder. @@ -11,16 +12,27 @@ //! using a negative-count vector shift (`vshlq_u16` with a negative //! shift), then narrowed with `vqmovn_u16` to u8x8. Two such halves are //! recombined with `vcombine_u8` before `vst3q_u8` / `vst4q_u8`. +//! +//! # Big-endian (`BE = true`) mode +//! +//! When `BE = true` each 8-pixel NEON load goes through +//! `load_endian_u16x8::<BE>` (defined in `endian.rs`) which applies a +//! per-lane byte-swap via `vrev16q_u8`. The branch is resolved at +//! monomorphisation — `BE = false` compiles to a plain `vld1q_u16`.
use core::arch::aarch64::*; use crate::row::scalar; +use super::endian::load_endian_u16x8; + // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// NEON high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and narrows to u8. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. NEON must be available (caller obligation). @@ -28,7 +40,7 @@ use crate::row::scalar; /// 3. `rgb_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -48,9 +60,13 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v); - let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v); - let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v); + let g_raw = load_endian_u16x8::(g.as_ptr().add(x).cast()); + let b_raw = load_endian_u16x8::(b.as_ptr().add(x).cast()); + let r_raw = load_endian_u16x8::(r.as_ptr().add(x).cast()); + + let g_v = vandq_u16(g_raw, mask_v); + let b_v = vandq_u16(b_raw, mask_v); + let r_v = vandq_u16(r_raw, mask_v); // Right-shift each 8-pixel vector by BITS-8, then narrow to u8x8. let r_sh = vqmovn_u16(vshlq_u16(r_v, shr)); @@ -70,7 +86,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::( + scalar::gbr_to_rgb_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -86,6 +102,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// NEON high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). Used by `Gbrp*` (no alpha plane). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. NEON must be available (caller obligation). 
@@ -93,7 +111,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -113,9 +131,13 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v); - let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v); - let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v); + let g_raw = load_endian_u16x8::(g.as_ptr().add(x).cast()); + let b_raw = load_endian_u16x8::(b.as_ptr().add(x).cast()); + let r_raw = load_endian_u16x8::(r.as_ptr().add(x).cast()); + + let g_v = vandq_u16(g_raw, mask_v); + let b_v = vandq_u16(b_raw, mask_v); + let r_v = vandq_u16(r_raw, mask_v); let r_sh = vqmovn_u16(vshlq_u16(r_v, shr)); let g_sh = vqmovn_u16(vshlq_u16(g_v, shr)); @@ -132,7 +154,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::( + scalar::gbr_to_rgba_opaque_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -148,6 +170,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// NEON high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. NEON must be available (caller obligation). @@ -155,7 +179,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -176,10 +200,15 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v); - let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v); - let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v); - let a_v = vandq_u16(vld1q_u16(a.as_ptr().add(x)), mask_v); + let g_raw = load_endian_u16x8::(g.as_ptr().add(x).cast()); + let b_raw = load_endian_u16x8::(b.as_ptr().add(x).cast()); + let r_raw = load_endian_u16x8::(r.as_ptr().add(x).cast()); + let a_raw = load_endian_u16x8::(a.as_ptr().add(x).cast()); + + let g_v = vandq_u16(g_raw, mask_v); + let b_v = vandq_u16(b_raw, mask_v); + let r_v = vandq_u16(r_raw, mask_v); + let a_v = vandq_u16(a_raw, mask_v); let r_sh = vqmovn_u16(vshlq_u16(r_v, shr)); let g_sh = vqmovn_u16(vshlq_u16(g_v, shr)); @@ -197,7 +226,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::( + scalar::gbra_to_rgba_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -214,6 +243,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// NEON high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples. /// Copies samples without shifting — output values in `[0, (1<( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -238,16 +269,16 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( let mask_v = vdupq_n_u16(((1u32 << BITS) - 1) as u16); let mut x = 0usize; while x + 8 <= width { - let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v); - let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v); - let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v); + let r_v = vandq_u16(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = vandq_u16(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = vandq_u16(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); // vst3q_u16 stores 8×3 = 24 u16 interleaved as R,G,B per pixel. let triple = uint16x8x3_t(r_v, g_v, b_v); vst3q_u16(rgb_u16_out.as_mut_ptr().add(x * 3), triple); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::( + scalar::gbr_to_rgb_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -263,6 +294,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// NEON high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. NEON must be available (caller obligation). @@ -270,7 +303,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -292,15 +325,15 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v); - let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v); - let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v); + let r_v = vandq_u16(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = vandq_u16(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = vandq_u16(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); let quad = uint16x8x4_t(r_v, g_v, b_v, opaque); vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4), quad); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -316,6 +349,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// NEON high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. NEON must be available (caller obligation). @@ -323,7 +358,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -345,16 +380,16 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( let mask_v = vdupq_n_u16(((1u32 << BITS) - 1) as u16); let mut x = 0usize; while x + 8 <= width { - let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v); - let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v); - let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v); - let a_v = vandq_u16(vld1q_u16(a.as_ptr().add(x)), mask_v); + let r_v = vandq_u16(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = vandq_u16(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = vandq_u16(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); + let a_v = vandq_u16(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask_v); let quad = uint16x8x4_t(r_v, g_v, b_v, a_v); vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4), quad); x += 8; } if x < width { - scalar::gbra_to_rgba_u16_high_bit_row::( + scalar::gbra_to_rgba_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], diff --git a/src/row/arch/neon/tests/planar_gbr_high_bit.rs b/src/row/arch/neon/tests/planar_gbr_high_bit.rs index 3f7762ba..0a9c3301 100644 --- a/src/row/arch/neon/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/neon/tests/planar_gbr_high_bit.rs @@ -37,9 +37,9 @@ fn neon_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -57,9 +57,9 @@ fn 
neon_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -77,9 +77,9 @@ fn neon_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -97,9 +97,9 @@ fn neon_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -118,9 +118,9 @@ fn neon_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, 
&b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -139,9 +139,9 @@ fn neon_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -161,9 +161,9 @@ fn neon_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -181,9 +181,9 @@ fn neon_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -201,9 +201,9 @@ fn neon_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = 
gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -221,9 +221,9 @@ fn neon_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -242,9 +242,9 @@ fn neon_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -263,9 +263,9 @@ fn neon_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + 
scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -286,9 +286,9 @@ fn neon_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -307,9 +307,9 @@ fn neon_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -327,9 +327,9 @@ fn neon_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -348,9 +348,9 @@ fn 
neon_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -358,3 +358,300 @@ fn neon_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { ); } } + +// ---- BE parity: NEON<BE=true> output must match NEON<BE=false> -------- +// +// For each kernel: +// 1. Generate LE plane data. +// 2. Byte-swap each element to produce BE-encoded plane data. +// 3. Run the kernel with BE=true on the byte-swapped input. +// 4. Run the kernel with BE=false on the original LE input. +// 5. Assert outputs are byte-identical.
+ +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgb_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgb_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 
0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbra_to_rgba_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + 
gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbra_to_rgba_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + 
for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = 
std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); 
+ } + assert_eq!( + out_le, out_be, + "NEON gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/arch/wasm_simd128/alpha_extract.rs b/src/row/arch/wasm_simd128/alpha_extract.rs index 105910be..b999b618 100644 --- a/src/row/arch/wasm_simd128/alpha_extract.rs +++ b/src/row/arch/wasm_simd128/alpha_extract.rs @@ -357,7 +357,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this wasm-simd128 helper does + // host-native u16 loads (`v128_load64_zero`), which match LE-on-disk + // only on LE hosts. The dispatcher routes BE = true directly to scalar + // (see `dispatch::alpha_extract`), so the SIMD path here is BE = false + // by construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -438,7 +443,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -575,7 +581,8 @@ mod tests { unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w); } - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -598,7 +605,8 @@ mod tests { unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w); } - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). 
+ scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -618,7 +626,8 @@ mod tests { unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w); } - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs b/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs index 7102afa3..94dcfbd5 100644 --- a/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs +++ b/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! wasm-simd128 kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE` (big-endian input when `true`). //! Lane width: 8 pixels per iteration (8 × u16 per `v128`). //! Scalar tail handles the remainder. //! @@ -21,12 +22,13 @@ use core::arch::wasm32::*; use crate::row::scalar; -use super::*; +use super::{endian::load_endian_u16x8, *}; // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and narrows to u8. +/// When `BE = true`, input u16 lanes are byte-swapped before processing. /// /// # Safety /// @@ -35,7 +37,7 @@ use super::*; /// 3. `rgb_out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -55,9 +57,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v); - let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v); - let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v); + let r_v = v128_and(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = v128_and(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = v128_and(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); // Shift right by BITS-8, then narrow u16x8 → u8x8 (in low half). let r_sh = u16x8_shr(r_v, shift); @@ -80,7 +82,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::( + scalar::gbr_to_rgb_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -95,6 +97,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). +/// When `BE = true`, input u16 lanes are byte-swapped before processing. /// /// # Safety /// @@ -103,7 +106,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -124,9 +127,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v); - let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v); - let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v); + let r_v = v128_and(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = v128_and(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = v128_and(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); let r_sh = u16x8_shr(r_v, shift); let g_sh = u16x8_shr(g_v, shift); @@ -144,7 +147,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::( + scalar::gbr_to_rgba_opaque_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -159,6 +162,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// wasm-simd128 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. +/// When `BE = true`, input u16 lanes are byte-swapped before processing. /// /// # Safety /// @@ -167,7 +171,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -189,10 +193,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v); - let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v); - let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v); - let a_v = v128_and(v128_load(a.as_ptr().add(x).cast()), mask_v); + let r_v = v128_and(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = v128_and(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = v128_and(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); + let a_v = v128_and(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask_v); let r_sh = u16x8_shr(r_v, shift); let g_sh = u16x8_shr(g_v, shift); @@ -211,7 +215,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::( + scalar::gbra_to_rgba_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -227,6 +231,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples. /// No shift — values copied directly, reordered G/B/R → R/G/B. +/// When `BE = true`, input u16 lanes are byte-swapped before processing. /// /// # Safety /// @@ -235,7 +240,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -252,14 +257,14 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( let mask_v = u16x8_splat(((1u32 << BITS) - 1) as u16); let mut x = 0usize; while x + 8 <= width { - let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v); - let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v); - let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v); + let r_v = v128_and(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = v128_and(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = v128_and(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::( + scalar::gbr_to_rgb_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -274,6 +279,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. +/// When `BE = true`, input u16 lanes are byte-swapped before processing. /// /// # Safety /// @@ -282,7 +288,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -304,14 +310,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v); - let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v); - let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v); + let r_v = v128_and(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = v128_and(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = v128_and(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -326,6 +332,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// wasm-simd128 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). +/// When `BE = true`, input u16 lanes are byte-swapped before processing. /// /// # Safety /// @@ -334,7 +341,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -356,15 +363,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( let mask_v = u16x8_splat(((1u32 << BITS) - 1) as u16); let mut x = 0usize; while x + 8 <= width { - let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v); - let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v); - let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v); - let a_v = v128_and(v128_load(a.as_ptr().add(x).cast()), mask_v); + let r_v = v128_and(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = v128_and(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = v128_and(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); + let a_v = v128_and(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask_v); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbra_to_rgba_u16_high_bit_row::( + scalar::gbra_to_rgba_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], diff --git a/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs b/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs index 8fb1faef..5b041673 100644 --- a/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs @@ -31,9 +31,9 @@ fn simd128_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -50,9 +50,9 @@ 
fn simd128_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -69,9 +69,9 @@ fn simd128_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -88,9 +88,9 @@ fn simd128_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -108,9 +108,9 @@ fn simd128_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + 
scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_wasm, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -128,9 +128,9 @@ fn simd128_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_wasm, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -149,9 +149,9 @@ fn simd128_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -168,9 +168,9 @@ fn simd128_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -187,9 +187,9 @@ fn 
simd128_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_wasm = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -206,9 +206,9 @@ fn simd128_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_wasm = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -226,9 +226,9 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_wasm = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_wasm, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -246,9 +246,9 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_wasm = std::vec![0u16; w * 4]; - 
scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_wasm, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -256,3 +256,281 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { ); } } + +// ---- BE parity: simd128 `BE = true` output must match simd128 `BE = false` output -- + +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +fn simd128_gbr_to_rgb_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgb_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128
gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let 
mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 
32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, 
&b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/arch/x86_avx2/alpha_extract.rs b/src/row/arch/x86_avx2/alpha_extract.rs index ba4ade4f..1ebe97c1 100644 --- a/src/row/arch/x86_avx2/alpha_extract.rs +++ 
b/src/row/arch/x86_avx2/alpha_extract.rs @@ -450,7 +450,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this AVX2 helper does host-native + // u16 loads (`_mm_loadu_si128`), which match LE-on-disk only on LE + // hosts. The dispatcher routes BE = true directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -554,7 +559,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -696,7 +702,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -720,7 +727,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). 
+ scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -741,7 +749,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx2/planar_gbr_high_bit.rs b/src/row/arch/x86_avx2/planar_gbr_high_bit.rs index 23c76e15..26d9e298 100644 --- a/src/row/arch/x86_avx2/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx2/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! AVX2 kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE: bool` (endianness of the source u16 planes). //! Lane width: 16 pixels per iteration (16 × u16 per `__m256i`). //! Scalar tail handles the remainder. //! @@ -18,16 +19,26 @@ //! //! Process 16 u16 pixels per outer iteration via two calls to the 128-bit //! `write_rgb_u16_8` / `write_rgba_u16_8` helpers (8 pixels each). +//! +//! # Big-endian (`BE = true`) mode +//! +//! Wide (16-pixel) iterations use `load_endian_u16x16::<BE>` from this +//! backend's own `endian.rs` (256-bit shuffle). 8-pixel tail iterations use +//! `load_endian_u16x8::<BE>` from the SSE4.1 `endian.rs` (128-bit shuffle). +//! Both branches are resolved at monomorphisation time.
use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u16x16, *}; +use crate::row::arch::x86_sse41::endian::load_endian_u16x8; // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and packs to u8. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -35,7 +46,7 @@ use super::*; /// 3. `rgb_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -58,9 +69,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( let mut x = 0usize; while x + 16 <= width { - let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256); - let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256); - let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256); + let r_v = _mm256_and_si256(load_endian_u16x16::(r.as_ptr().add(x).cast()), mask256); + let g_v = _mm256_and_si256(load_endian_u16x16::(g.as_ptr().add(x).cast()), mask256); + let b_v = _mm256_and_si256(load_endian_u16x16::(b.as_ptr().add(x).cast()), mask256); // Variable-count logical right-shift for all 16 u16 lanes. let r_sh = _mm256_srl_epi16(r_v, shr_count); @@ -85,9 +96,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( // Drain remaining 8-pixel blocks with the SSE-width path. 
if x + 8 <= width { let zero = _mm_setzero_si128(); - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -100,7 +111,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::( + scalar::gbr_to_rgb_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -116,6 +127,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -123,7 +136,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -147,9 +160,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( let mut x = 0usize; while x + 16 <= width { - let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256); - let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256); - let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256); + let r_v = _mm256_and_si256(load_endian_u16x16::(r.as_ptr().add(x).cast()), mask256); + let g_v = _mm256_and_si256(load_endian_u16x16::(g.as_ptr().add(x).cast()), mask256); + let b_v = _mm256_and_si256(load_endian_u16x16::(b.as_ptr().add(x).cast()), mask256); let r_sh = _mm256_srl_epi16(r_v, shr_count); let g_sh = _mm256_srl_epi16(g_v, shr_count); @@ -174,9 +187,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -189,7 +202,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::( + scalar::gbr_to_rgba_opaque_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -205,6 +218,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// AVX2 high-bit-depth G/B/R/A planar → packed `R, G, B, A` 
**bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -212,7 +227,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -236,10 +251,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( let mut x = 0usize; while x + 16 <= width { - let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256); - let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256); - let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256); - let a_v = _mm256_and_si256(_mm256_loadu_si256(a.as_ptr().add(x).cast()), mask256); + let r_v = _mm256_and_si256(load_endian_u16x16::(r.as_ptr().add(x).cast()), mask256); + let g_v = _mm256_and_si256(load_endian_u16x16::(g.as_ptr().add(x).cast()), mask256); + let b_v = _mm256_and_si256(load_endian_u16x16::(b.as_ptr().add(x).cast()), mask256); + let a_v = _mm256_and_si256(load_endian_u16x16::(a.as_ptr().add(x).cast()), mask256); let r_sh = _mm256_srl_epi16(r_v, shr_count); let g_sh = _mm256_srl_epi16(g_v, shr_count); @@ -261,10 +276,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = 
_mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -279,7 +294,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::( + scalar::gbra_to_rgba_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -297,6 +312,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// No shift — values copied directly, reordered G/B/R → R/G/B. /// Processes 16 pixels per outer loop via two 8-pixel `write_rgb_u16_8` calls. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -304,7 +321,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -322,27 +339,36 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( let mut x = 0usize; while x + 16 <= width { // Two 8-pixel halves using the SSE helper. 
- let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_lo = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_lo = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_lo = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_u16_out.as_mut_ptr().add(x * 3)); - let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_hi = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_hi = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_hi = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_u16_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::( + scalar::gbr_to_rgb_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -358,6 +384,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// AVX2 
high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -365,7 +393,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -387,9 +415,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( let mut x = 0usize; while x + 16 <= width { - let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_lo = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_lo = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_lo = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8( r_lo, g_lo, @@ -398,9 +426,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( rgba_u16_out.as_mut_ptr().add(x * 4), ); - let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_hi = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_hi = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_hi = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_hi, g_hi, @@ -412,14 +449,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( x += 16; 
} if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -435,6 +472,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// AVX2 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -442,7 +481,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -464,16 +503,28 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16); let mut x = 0usize; while x + 16 <= width { - let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_lo = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_lo = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_lo = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_lo = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); + let a_lo = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_lo, g_lo, b_lo, a_lo, rgba_u16_out.as_mut_ptr().add(x * 4)); - let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); - let a_hi = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 8).cast()), mask128); + let r_hi = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_hi = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_hi = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 8).cast()), + mask128, + ); + let a_hi = _mm_and_si128( + load_endian_u16x8::(a.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_hi, g_hi, @@ -485,15 +536,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( x += 16; } if x + 8 <= width { - let r_v = 
_mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbra_to_rgba_u16_high_bit_row::( + scalar::gbra_to_rgba_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], diff --git a/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs index 72225d19..505256fe 100644 --- a/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs @@ -37,9 +37,9 @@ fn avx2_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -60,9 +60,9 @@ fn avx2_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut 
out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -83,9 +83,9 @@ fn avx2_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -106,9 +106,9 @@ fn avx2_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -130,9 +130,9 @@ fn avx2_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -154,9 +154,9 @@ fn avx2_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 
0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -179,9 +179,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -202,9 +202,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -225,9 +225,9 @@ fn avx2_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - 
gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -248,9 +248,9 @@ fn avx2_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -272,9 +272,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -296,9 +296,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -321,9 +321,9 @@ fn 
avx2_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -345,9 +345,9 @@ fn avx2_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -368,9 +368,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -392,9 +392,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, 
&a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -402,3 +402,329 @@ fn avx2_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { ); } } + +// ---- BE parity: AVX2<BE=true> output must match AVX2<BE=false> -------- + +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); +
gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by 
Miri")] +fn avx2_gbra_to_rgba_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbra_to_rgba_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") 
{ + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = 
byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; 
+ unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/arch/x86_avx512/alpha_extract.rs b/src/row/arch/x86_avx512/alpha_extract.rs index f311e366..203e08e3 100644 --- a/src/row/arch/x86_avx512/alpha_extract.rs +++ b/src/row/arch/x86_avx512/alpha_extract.rs @@ -434,7 +434,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this AVX-512 helper does host-native + // u16 loads (`_mm256_loadu_si256`), which match LE-on-disk only on LE + // hosts. The dispatcher routes BE = true directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. 
+ scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -519,7 +524,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -670,7 +676,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -696,7 +703,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -719,7 +727,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). 
+ scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx512/planar_gbr_high_bit.rs b/src/row/arch/x86_avx512/planar_gbr_high_bit.rs index 4f763434..afc8ccc3 100644 --- a/src/row/arch/x86_avx512/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx512/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! AVX-512 (F + BW) kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE: bool` (endianness of the source u16 planes). //! Lane width: 32 pixels per iteration (32 × u16 per `__m512i`). //! Scalar tail handles the remainder. //! @@ -20,16 +21,25 @@ //! //! Process 32 pixels via four calls to `write_rgb_u16_8` / //! `write_rgba_u16_8` (8 pixels each, SSE4.1 128-bit helpers). +//! +//! # Big-endian (`BE = true`) mode +//! +//! Wide (32-pixel) iterations use `load_endian_u16x32::` from this +//! backend's own `endian.rs` (512-bit shuffle). 8-pixel tail iterations use +//! `load_endian_u16x8::` from the SSE4.1 `endian.rs` (128-bit shuffle). use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u16x32, *}; +use crate::row::arch::x86_sse41::endian::load_endian_u16x8; // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and packs to u8. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -37,7 +47,7 @@ use super::*; /// 3. `rgb_out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -60,9 +70,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( let mut x = 0usize; while x + 32 <= width { // Load 32 u16 pixels per plane via 512-bit loads, then mask. - let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512); - let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512); - let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512); + let r_v = _mm512_and_si512(load_endian_u16x32::(r.as_ptr().add(x).cast()), mask512); + let g_v = _mm512_and_si512(load_endian_u16x32::(g.as_ptr().add(x).cast()), mask512); + let b_v = _mm512_and_si512(load_endian_u16x32::(b.as_ptr().add(x).cast()), mask512); // Shift all 32 u16 lanes right by BITS-8. let r_sh = _mm512_srl_epi16(r_v, shr_count); @@ -124,9 +134,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( } // Drain remaining 8-pixel blocks before scalar tail. 
while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -139,7 +149,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::( + scalar::gbr_to_rgb_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -155,6 +165,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -162,7 +174,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -185,9 +197,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( let mut x = 0usize; while x + 32 <= width { - let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512); - let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512); - let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512); + let r_v = _mm512_and_si512(load_endian_u16x32::(r.as_ptr().add(x).cast()), mask512); + let g_v = _mm512_and_si512(load_endian_u16x32::(g.as_ptr().add(x).cast()), mask512); + let b_v = _mm512_and_si512(load_endian_u16x32::(b.as_ptr().add(x).cast()), mask512); let r_sh = _mm512_srl_epi16(r_v, shr_count); let g_sh = _mm512_srl_epi16(g_v, shr_count); @@ -245,9 +257,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -260,7 +272,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::( + scalar::gbr_to_rgba_opaque_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -276,6 +288,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// AVX-512 (F+BW) high-bit-depth G/B/R/A planar → 
packed `R, G, B, A` **bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -283,7 +297,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -306,10 +320,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( let mut x = 0usize; while x + 32 <= width { - let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512); - let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512); - let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512); - let a_v = _mm512_and_si512(_mm512_loadu_si512(a.as_ptr().add(x).cast()), mask512); + let r_v = _mm512_and_si512(load_endian_u16x32::(r.as_ptr().add(x).cast()), mask512); + let g_v = _mm512_and_si512(load_endian_u16x32::(g.as_ptr().add(x).cast()), mask512); + let b_v = _mm512_and_si512(load_endian_u16x32::(b.as_ptr().add(x).cast()), mask512); + let a_v = _mm512_and_si512(load_endian_u16x32::(a.as_ptr().add(x).cast()), mask512); let r_sh = _mm512_srl_epi16(r_v, shr_count); let g_sh = _mm512_srl_epi16(g_v, shr_count); @@ -376,10 +390,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = 
_mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -394,7 +408,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::( + scalar::gbra_to_rgba_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -412,6 +426,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// No shift — values copied directly, reordered G/B/R → R/G/B. /// Processes 32 pixels per outer loop via four 8-pixel `write_rgb_u16_8` calls. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -419,7 +435,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -438,40 +454,67 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( while x + 32 <= width { // Four 8-pixel blocks (offsets 0, 8, 16, 24). 
{ - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 8) * 3)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 16).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 16).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 16).cast()), + mask128, + ); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 16) * 3)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128); - let b_v = 
_mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 24).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 24).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 24).cast()), + mask128, + ); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 24) * 3)); } x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::( + scalar::gbr_to_rgb_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -487,6 +530,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -494,7 +539,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -517,15 +562,24 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( let mut x = 0usize; while x + 32 <= width { { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -535,9 +589,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 16).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 16).cast()), + mask128, + ); 
+ let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 16).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -547,9 +610,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 24).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 24).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 24).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -561,14 +633,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -584,6 +656,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// AVX-512 (F+BW) high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. 
AVX-512BW must be available (caller obligation). @@ -591,7 +665,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -614,17 +688,29 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( let mut x = 0usize; while x + 32 <= width { { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 8).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 8).cast()), + mask128, + ); + let a_v = _mm_and_si128( + load_endian_u16x8::(a.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -634,10 +720,22 @@ pub(crate) unsafe 
fn gbra_to_rgba_u16_high_bit_row( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 16).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 16).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 16).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 16).cast()), + mask128, + ); + let a_v = _mm_and_si128( + load_endian_u16x8::(a.as_ptr().add(x + 16).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -647,10 +745,22 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 24).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::(r.as_ptr().add(x + 24).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::(g.as_ptr().add(x + 24).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::(b.as_ptr().add(x + 24).cast()), + mask128, + ); + let a_v = _mm_and_si128( + load_endian_u16x8::(a.as_ptr().add(x + 24).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -662,15 +772,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = 
_mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbra_to_rgba_u16_high_bit_row::( + scalar::gbra_to_rgba_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], diff --git a/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs index 3a5ca557..80bc153a 100644 --- a/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs @@ -37,9 +37,9 @@ fn avx512_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -60,9 +60,9 @@ fn avx512_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -83,9 +83,9 @@ fn 
avx512_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -106,9 +106,9 @@ fn avx512_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -130,9 +130,9 @@ fn avx512_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -154,9 +154,9 @@ fn avx512_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + 
scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -179,9 +179,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -202,9 +202,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -225,9 +225,9 @@ fn avx512_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -248,9 +248,9 @@ 
fn avx512_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -272,9 +272,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -296,9 +296,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -321,9 +321,9 @@ fn avx512_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, 
&r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -345,9 +345,9 @@ fn avx512_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -368,9 +368,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -392,9 +392,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -402,3 
+402,329 @@ fn avx512_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { ); } } + +// ---- BE parity: AVX-512 BE=true output must match AVX-512 BE=false output -- + +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86
SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g 
= gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = 
gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + 
gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, 
true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/arch/x86_sse41/alpha_extract.rs b/src/row/arch/x86_sse41/alpha_extract.rs index 5abdfd08..d327e299 100644 --- a/src/row/arch/x86_sse41/alpha_extract.rs +++ b/src/row/arch/x86_sse41/alpha_extract.rs @@ -356,7 +356,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this SSE4.1 helper does host-native + // u16 loads (`_mm_loadl_epi64`), which match LE-on-disk only on LE + // hosts. The dispatcher routes BE = true directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. 
+ scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -440,7 +445,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -581,7 +587,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -605,7 +612,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -626,7 +634,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). 
+ scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_sse41/planar_gbr_high_bit.rs b/src/row/arch/x86_sse41/planar_gbr_high_bit.rs index 364eac74..f28d4fe3 100644 --- a/src/row/arch/x86_sse41/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_sse41/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! SSE4.1 kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE: bool` (endianness of the source u16 planes). //! Lane width: 8 pixels per iteration (8 × u16 per `__m128i`). //! Scalar tail handles the remainder. //! @@ -17,16 +18,26 @@ //! Use the existing `write_rgb_u16_8` / `write_rgba_u16_8` helpers from //! `x86_common` which interleave 8 u16 lanes per channel into packed //! RGB / RGBA u16 output. +//! +//! # Big-endian (`BE = true`) mode +//! +//! When `BE = true` each 8-pixel load goes through +//! `load_endian_u16x8::<BE>` (defined in `endian.rs`) which applies +//! `_mm_shuffle_epi8` (SSSE3 pshufb) to byte-swap every u16 lane. +//! The branch is resolved at monomorphisation — `BE = false` compiles +//! to a plain `_mm_loadu_si128`. use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u16x8, *}; // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and packs to u8. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -34,7 +45,7 @@ use super::*; /// 3. `rgb_out.len()` ≥ `3 * width`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -58,9 +69,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); // Variable-count logical right-shift by BITS-8 per u16 lane. let r_sh = _mm_srl_epi16(r_v, shr_count); @@ -81,7 +92,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::( + scalar::gbr_to_rgb_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -97,6 +108,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -104,7 +117,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -127,9 +140,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); @@ -146,7 +159,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::( + scalar::gbr_to_rgba_opaque_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -162,6 +175,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// SSE4.1 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -169,7 +184,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row( /// 3. `rgba_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -191,10 +206,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); + let a_v = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask_v); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); @@ -213,7 +228,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::( + scalar::gbra_to_rgba_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -230,6 +245,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples. /// No shift — values copied directly, reordered G/B/R → R/G/B. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -237,7 +254,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -254,14 +271,14 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( let mask_v = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16); let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::( + scalar::gbr_to_rgb_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -277,6 +294,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -284,7 +303,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -307,14 +326,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], @@ -330,6 +349,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// SSE4.1 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -337,7 +358,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -359,15 +380,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row( let mask_v = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16); let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::(b.as_ptr().add(x).cast()), mask_v); + let a_v = _mm_and_si128(load_endian_u16x8::(a.as_ptr().add(x).cast()), mask_v); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbra_to_rgba_u16_high_bit_row::( + scalar::gbra_to_rgba_u16_high_bit_row::( &g[x..width], &b[x..width], &r[x..width], diff --git a/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs index 7292b15e..f0c11bf1 100644 --- a/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs @@ -37,9 +37,9 @@ fn sse41_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); 
} assert_eq!( out_scalar, out_sse, @@ -60,9 +60,9 @@ fn sse41_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -83,9 +83,9 @@ fn sse41_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -106,9 +106,9 @@ fn sse41_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -130,9 +130,9 @@ fn sse41_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + 
scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -154,9 +154,9 @@ fn sse41_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -179,9 +179,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -202,9 +202,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -225,9 +225,9 @@ fn 
sse41_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -248,9 +248,9 @@ fn sse41_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -272,9 +272,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -296,9 +296,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - 
scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -321,9 +321,9 @@ fn sse41_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -345,9 +345,9 @@ fn sse41_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -368,9 +368,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } 
assert_eq!( out_scalar, out_sse, @@ -392,9 +392,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -402,3 +402,332 @@ fn sse41_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { ); } } + +// ---- BE parity: SSE4.1 BE=true output must match SSE4.1 BE=false output --- +// +// Byte-swap LE inputs to produce BE-encoded data; verify that BE=true kernel +// output is byte-identical to BE=false kernel output on the original LE data. + +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgb_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn
sse41_gbr_to_rgb_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 
0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + 
let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + 
gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] 
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/dispatch/alpha_extract.rs 
b/src/row/dispatch/alpha_extract.rs index 75cccb8d..00ecb61e 100644 --- a/src/row/dispatch/alpha_extract.rs +++ b/src/row/dispatch/alpha_extract.rs @@ -260,17 +260,45 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz /// scatter α plane (u16) into `rgba_out[3 + 4*n]` (u8) with /// depth-conv `>> (BITS - 8)`. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. +/// `BE` selects the source α plane byte order (`false` = LE on disk/wire, +/// `true` = BE on disk/wire). The SIMD α-extract helpers use host-native +/// `u16` loads (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) AND +/// hardcode their scalar tail to `BE = false`. So SIMD is only +/// correct when BOTH the host CPU is little-endian AND the source data is +/// little-endian — any other quadrant either loads the wrong byte order in +/// the vector body (LE-data on BE-host / BE-data on LE-host) or feeds +/// already-native u16 samples through `u16::from_le` in the scalar tail +/// (BE-data on BE-host), corrupting the tail at non-multiple widths. +/// +/// The dispatcher computes +/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and routes to +/// scalar in every other quadrant. The scalar helper is target-endian-aware +/// via `u16::from_be` / `u16::from_le`, so this scalar fallback emits the +/// correct α plane on every host. Phase 4 will plumb BE through the SIMD +/// helpers if a BE-input sinker hot-path lands.
+/// +/// Truth table (`safe_for_simd = !BE && target_endian == "little"`): +/// - LE data, LE host: `!false && true = true` → SIMD (host-native LE u16 loads correct, tail `from_le` is no-op) +/// - LE data, BE host: `!false && false = false` → scalar (handles via `from_le`) +/// - BE data, LE host: `!true && true = false` → scalar (handles via `from_be`) +/// - BE data, BE host: `!true && false = false` → scalar (handles via `from_be`; SIMD vector body would be correct but tail `from_le` would corrupt non-multiple widths — see codex 4th-pass review of PR #82) +/// +/// Selects the highest available SIMD backend on LE-host with LE-data; +/// falls back to scalar otherwise. When `use_simd` is `false`, calls +/// scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_plane_u16_to_u8( +pub(crate) fn copy_alpha_plane_u16_to_u8( alpha: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); + // SIMD α-extract helpers use host-native u16 loads + a scalar tail + // hardcoded to BE=false. They are only correct on LE host with LE + // source data. Force scalar in every other quadrant. + let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { + return scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -306,7 +334,7 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( }, _ => {} } - scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); + scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); } // --------------------------------------------------------------------------- @@ -317,17 +345,30 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( /// scatter α plane (u16) into `rgba_out[3 + 4*n]` (u16). No depth /// conversion. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. 
+/// `BE` selects the source α plane byte order (`false` = LE on disk/wire, +/// `true` = BE on disk/wire). The dispatcher computes +/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and routes to +/// scalar in every other quadrant: see `copy_alpha_plane_u16_to_u8` above +/// for the truth table and rationale (SIMD α-extract uses host-native u16 +/// loads AND hardcodes its scalar tail to `BE=false`, so it only handles +/// the LE-host/LE-data quadrant correctly; scalar is target-endian-aware). +/// +/// Selects the highest available SIMD backend on LE-host with LE-data; +/// falls back to scalar otherwise. When `use_simd` is `false`, calls +/// scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_plane_u16( +pub(crate) fn copy_alpha_plane_u16( alpha: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); + // SIMD α-extract helpers use host-native u16 loads + a scalar tail + // hardcoded to BE=false. They are only correct on LE host with LE + // source data. Force scalar in every other quadrant. + let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { + return scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -363,5 +404,5 @@ pub(crate) fn copy_alpha_plane_u16( }, _ => {} } - scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); + scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); } diff --git a/src/row/dispatch/planar_gbr_high_bit.rs b/src/row/dispatch/planar_gbr_high_bit.rs index 544d8166..9511e662 100644 --- a/src/row/dispatch/planar_gbr_high_bit.rs +++ b/src/row/dispatch/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! Runtime SIMD dispatchers for high-bit-depth planar GBR sources (Tier 10b). //! -//! Seven kernel variants, all const-generic over `BITS ∈ {9, 10, 12, 14, 16}`: +//! 
Seven kernel variants, all const-generic over `BITS ∈ {9, 10, 12, 14, 16}` +//! and `BE` (big-endian input when `true`): //! - [`gbr_to_rgb_high_bit_row`] — interleave G/B/R → packed `R, G, B` bytes. //! - [`gbr_to_rgb_u16_high_bit_row`] — interleave G/B/R → packed `R, G, B` u16. //! - [`gbr_to_rgba_opaque_high_bit_row`] — interleave G/B/R → packed @@ -39,8 +40,9 @@ use crate::{ /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8`. `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn gbr_to_rgb_high_bit_row( +pub fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -65,31 +67,33 @@ pub fn gbr_to_rgb_high_bit_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } + unsafe { arch::neon::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified available. - unsafe { arch::x86_avx512::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } + unsafe { arch::x86_avx512::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } + unsafe { arch::x86_avx2::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. - unsafe { arch::x86_sse41::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } + unsafe { arch::x86_sse41::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time enabled. 
- unsafe { arch::wasm_simd128::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } + unsafe { + arch::wasm_simd128::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); + } return; } }, @@ -97,7 +101,7 @@ pub fn gbr_to_rgb_high_bit_row( } } - scalar::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); + scalar::gbr_to_rgb_high_bit_row::(g, b, r, rgb_out, width); } // --------------------------------------------------------------------------- @@ -107,8 +111,9 @@ pub fn gbr_to_rgb_high_bit_row( /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B` **u16** /// elements. Samples are copied as-is (no depth conversion); values stay in /// `[0, (1 << BITS) - 1]`. `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn gbr_to_rgb_u16_high_bit_row( +pub fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -134,7 +139,7 @@ pub fn gbr_to_rgb_u16_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + arch::neon::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); } return; } @@ -143,21 +148,27 @@ pub fn gbr_to_rgb_u16_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. unsafe { - arch::x86_avx512::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + arch::x86_avx512::gbr_to_rgb_u16_high_bit_row::( + g, b, r, rgb_u16_out, width, + ); } return; } if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + arch::x86_avx2::gbr_to_rgb_u16_high_bit_row::( + g, b, r, rgb_u16_out, width, + ); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. 
unsafe { - arch::x86_sse41::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + arch::x86_sse41::gbr_to_rgb_u16_high_bit_row::( + g, b, r, rgb_u16_out, width, + ); } return; } @@ -166,7 +177,9 @@ pub fn gbr_to_rgb_u16_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. unsafe { - arch::wasm_simd128::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + arch::wasm_simd128::gbr_to_rgb_u16_high_bit_row::( + g, b, r, rgb_u16_out, width, + ); } return; } @@ -175,7 +188,7 @@ pub fn gbr_to_rgb_u16_high_bit_row( } } - scalar::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + scalar::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); } // --------------------------------------------------------------------------- @@ -185,8 +198,9 @@ pub fn gbr_to_rgb_u16_high_bit_row( /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B, A` **bytes** /// with constant α = `0xFF`. Used by `GbrpN` for standalone `with_rgba` path. /// `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn gbr_to_rgba_opaque_high_bit_row( +pub fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -212,7 +226,7 @@ pub fn gbr_to_rgba_opaque_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::neon::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); } return; } @@ -221,21 +235,27 @@ pub fn gbr_to_rgba_opaque_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. unsafe { - arch::x86_avx512::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::x86_avx512::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } if avx2_available() { // SAFETY: AVX2 verified available. 
unsafe { - arch::x86_avx2::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::x86_avx2::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::x86_sse41::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } @@ -244,7 +264,9 @@ pub fn gbr_to_rgba_opaque_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. unsafe { - arch::wasm_simd128::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::wasm_simd128::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } @@ -253,7 +275,7 @@ pub fn gbr_to_rgba_opaque_high_bit_row( } } - scalar::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + scalar::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); } // --------------------------------------------------------------------------- @@ -264,8 +286,9 @@ pub fn gbr_to_rgba_opaque_high_bit_row( /// **u16** elements with constant α = `(1 << BITS) - 1` (native-depth /// opaque). Used by `GbrpN` for standalone `with_rgba_u16` path. /// `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn gbr_to_rgba_opaque_u16_high_bit_row( +pub fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -291,7 +314,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::neon::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -302,7 +325,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. 
unsafe { - arch::x86_avx512::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::x86_avx512::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -311,7 +334,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::x86_avx2::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -320,7 +343,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::x86_sse41::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -331,7 +354,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. unsafe { - arch::wasm_simd128::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::wasm_simd128::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -342,7 +365,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( } } - scalar::gbr_to_rgba_opaque_u16_high_bit_row::(g, b, r, rgba_u16_out, width); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::(g, b, r, rgba_u16_out, width); } // --------------------------------------------------------------------------- @@ -352,9 +375,10 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **bytes**. Alpha is downshifted by `BITS - 8` (real source α, not /// constant). `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn gbra_to_rgba_high_bit_row( +pub fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -382,7 +406,7 @@ pub fn gbra_to_rgba_high_bit_row( if neon_available() { // SAFETY: NEON verified available. 
unsafe { - arch::neon::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::neon::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } @@ -391,21 +415,21 @@ pub fn gbra_to_rgba_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. unsafe { - arch::x86_avx512::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::x86_avx512::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::x86_avx2::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::x86_sse41::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } @@ -414,7 +438,9 @@ pub fn gbra_to_rgba_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. unsafe { - arch::wasm_simd128::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::wasm_simd128::gbra_to_rgba_high_bit_row::( + g, b, r, a, rgba_out, width, + ); } return; } @@ -423,7 +449,7 @@ pub fn gbra_to_rgba_high_bit_row( } } - scalar::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + scalar::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } // --------------------------------------------------------------------------- @@ -433,9 +459,10 @@ pub fn gbra_to_rgba_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **u16** elements. Alpha is copied directly without depth conversion (values /// stay in `[0, (1 << BITS) - 1]`). `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn gbra_to_rgba_u16_high_bit_row( +pub fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -463,7 +490,9 @@ pub fn gbra_to_rgba_u16_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbra_to_rgba_u16_high_bit_row::(g, b, r, a, rgba_u16_out, width); + arch::neon::gbra_to_rgba_u16_high_bit_row::( + g, b, r, a, rgba_u16_out, width, + ); } return; } @@ -472,7 +501,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. unsafe { - arch::x86_avx512::gbra_to_rgba_u16_high_bit_row::( + arch::x86_avx512::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -481,7 +510,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbra_to_rgba_u16_high_bit_row::( + arch::x86_avx2::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -490,7 +519,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbra_to_rgba_u16_high_bit_row::( + arch::x86_sse41::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -501,7 +530,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. 
unsafe { - arch::wasm_simd128::gbra_to_rgba_u16_high_bit_row::( + arch::wasm_simd128::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -512,7 +541,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( } } - scalar::gbra_to_rgba_u16_high_bit_row::(g, b, r, a, rgba_u16_out, width); + scalar::gbra_to_rgba_u16_high_bit_row::(g, b, r, a, rgba_u16_out, width); } // --------------------------------------------------------------------------- @@ -529,9 +558,10 @@ pub fn gbra_to_rgba_u16_high_bit_row( /// `use_simd` accepted for signature consistency with the rest of the /// row dispatcher family. Currently no SIMD path is wired (kernel is /// scalar-only); the flag is reserved for future backends. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn gbr_to_luma_u16_high_bit_row( +pub fn gbr_to_luma_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -551,5 +581,5 @@ pub fn gbr_to_luma_u16_high_bit_row( assert!(b.len() >= width, "b row too short"); assert!(r.len() >= width, "r row too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - scalar::gbr_to_luma_u16_high_bit_row::(g, b, r, luma_out, width, matrix, full_range); + scalar::gbr_to_luma_u16_high_bit_row::(g, b, r, luma_out, width, matrix, full_range); } diff --git a/src/row/mod.rs b/src/row/mod.rs index 297f1c3c..a5d210f1 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -615,7 +615,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgb: [u8; 0] = []; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -626,7 +626,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgb: [u16; 0] = []; - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); + 
gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -637,7 +637,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -648,7 +648,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -660,7 +660,7 @@ mod overflow_tests { let r: [u16; 0] = []; let a: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -672,7 +672,7 @@ mod overflow_tests { let r: [u16; 0] = []; let a: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); } // ---- Tier 11 gray dispatchers — `width × {3, 4}` overflow ---- diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs index a5190496..6c77346a 100644 --- a/src/row/scalar/alpha_extract.rs +++ b/src/row/scalar/alpha_extract.rs @@ -96,20 +96,33 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz } } -/// Yuva*p9/10/12/14 → u8 RGBA: scatter α plane (u16) into -/// `rgba_out[3 + 4*n]` (u8) with depth-conv `>> (BITS - 8)`. 
+/// Yuva*p9/10/12/14/16 + Gbrap10/12/14/16 → u8 RGBA: scatter α plane +/// (u16) into `rgba_out[3 + 4*n]` (u8) with depth-conv `>> (BITS - 8)`. /// -/// `BITS` is the source α bit depth (9, 10, 12, or 14). +/// `BITS` is the source α bit depth (any value in `[8, 16]`; the runtime +/// `assert!` enforces the range). In practice callers pass 9, 10, 12, 14, +/// or 16. `BE` selects the **byte order** of the encoded source α plane: +/// `false` = LE on disk/wire (e.g., AV `Yuva420p10le`, `Gbrap10le`), +/// `true` = BE on disk/wire (e.g., `Yuva420p10be`, `Gbrap10be`). /// -/// α is masked with `(1 << BITS) - 1` BEFORE the shift to canonicalize -/// over-range source samples. Frame constructors admit raw u16 input -/// (e.g., p010-style buffers store the 10 active bits in the HIGH bits +/// Each raw u16 sample is converted from its disk byte order into host-native +/// order via `u16::from_le` / `u16::from_be` BEFORE the BITS-mask + shift. +/// On a host whose endianness matches the data, the conversion compiles to a +/// no-op; otherwise it is a `swap_bytes`. This mirrors the +/// `load_endian_u16x*::` SIMD pattern from #81 so scalar tails and SIMD +/// paths stay byte-for-byte equivalent on every host. Without this, a +/// big-endian host (e.g., s390x) processing LE source data would emit a +/// byte-reversed α plane. +/// +/// α is masked with `(1 << BITS) - 1` AFTER the endian conversion to +/// canonicalize over-range source samples. Frame constructors admit raw u16 +/// input (e.g., p010-style buffers store the 10 active bits in the HIGH bits /// of u16), so an unmasked over-range value would otherwise leak through /// the shift and produce divergent output between scalar and SIMD paths. /// See sibling inline-α kernels (`yuva_4_*` row impls) for the same /// pattern with comment "silently turning over-range alpha into /// transparent output". 
-pub(crate) fn copy_alpha_plane_u16_to_u8( +pub(crate) fn copy_alpha_plane_u16_to_u8( alpha: &[u16], rgba_out: &mut [u8], width: usize, @@ -122,7 +135,12 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for n in 0..width { - rgba_out[n * 4 + 3] = ((alpha[n] & mask) >> shift) as u8; + let raw = if BE { + u16::from_be(alpha[n]) + } else { + u16::from_le(alpha[n]) + }; + rgba_out[n * 4 + 3] = ((raw & mask) >> shift) as u8; } } @@ -131,7 +149,17 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( /// depth, masked to `(1 << BITS) - 1` so over-range source samples /// don't leak through (parity with the inline-α kernels — frame /// constructors admit raw u16 input above the BITS-bit native range). -pub(crate) fn copy_alpha_plane_u16( +/// +/// `BE` selects the **byte order** of the encoded source α plane: +/// `false` = LE on disk/wire, `true` = BE on disk/wire. Each raw u16 +/// sample is converted to host-native order via `u16::from_le` / +/// `u16::from_be` BEFORE masking. On a host whose endianness matches +/// the data, the conversion compiles to a no-op; otherwise it is a +/// `swap_bytes`. Mirrors the `load_endian_u16x*::` SIMD pattern +/// from #81 so scalar and SIMD stay byte-for-byte equivalent on every +/// host. Without this, a BE host processing LE source data would emit +/// a byte-reversed α plane. +pub(crate) fn copy_alpha_plane_u16( alpha: &[u16], rgba_out: &mut [u16], width: usize, @@ -143,7 +171,12 @@ pub(crate) fn copy_alpha_plane_u16( debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); let mask: u16 = ((1u32 << BITS) - 1) as u16; for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n] & mask; + let raw = if BE { + u16::from_be(alpha[n]) + } else { + u16::from_le(alpha[n]) + }; + rgba_out[n * 4 + 3] = raw & mask; } } @@ -263,33 +296,51 @@ mod tests { ); } + // ---- LE-host fixture tests ---- + // + // The tests below use host-native `u16` literals (e.g. 
+ // `vec![0x3FFu16, 0x1FF]`) as if they were the on-disk LE encoding of + // those samples and then call the kernel with `` + // (LE path). On a BE host (e.g., s390x under miri-sb), host-native + // `u16` storage does NOT lay bytes out little-endian, so the kernel's + // `u16::from_le` byte-swap correctly reinterprets the host-native + // value and produces a different logical value than the literal — + // making the assertion fail. The kernel is correct: its BE-host + // scalar correctness is locked down by the dedicated + // `*_be_parity_with_swapped_buffer` tests below, which build + // BE-encoded fixtures via `swap_bytes` from LE inputs and assert + // byte-for-byte parity. Gating these LE-fixture tests on + // `target_endian = "little"` avoids fixture-vs-kernel byte-order + // confusion without weakening coverage. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_to_u8_depth_converts_at_each_bits_value() { // BITS=10 let alpha: std::vec::Vec = std::vec![0x3FF, 0x1FF]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba, 2); + copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x7F]); // BITS=12 let alpha: std::vec::Vec = std::vec![0xFFF, 0x800]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba, 2); + copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x80]); // BITS=16 let alpha: std::vec::Vec = std::vec![0xFFFF, 0x8000]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_plane_u16_to_u8::<16>(&alpha, &mut rgba, 2); + copy_alpha_plane_u16_to_u8::<16, false>(&alpha, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x80]); } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_preserves_native_u16_within_bits_range() { // In-range values pass through unchanged. 
let alpha: std::vec::Vec = std::vec![0x3FF, 0x1FF, 0x000]; let mut rgba = std::vec![1u16; 12]; - copy_alpha_plane_u16::<10>(&alpha, &mut rgba, 3); + copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba, 3); assert_eq!( rgba, std::vec![1, 1, 1, 0x3FF, 1, 1, 1, 0x1FF, 1, 1, 1, 0x000] @@ -297,6 +348,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_masks_overrange_to_bits_range() { // Over-range α (e.g., 0xFFFF at BITS=10) must be masked to low BITS. // Without the mask, raw u16 0xFFFF would leak straight to output and @@ -304,7 +356,7 @@ mod tests { // diverging from the inline-α scalar reference. let alpha: std::vec::Vec = std::vec![0xFFFF, 0x0500, 0x07FF]; let mut rgba = std::vec![1u16; 12]; - copy_alpha_plane_u16::<10>(&alpha, &mut rgba, 3); + copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba, 3); assert_eq!( rgba, std::vec![1, 1, 1, 0x3FF, 1, 1, 1, 0x100, 1, 1, 1, 0x3FF] @@ -312,6 +364,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_to_u8_masks_overrange_then_shifts() { // Without the BITS mask, 0x0500 at BITS=10 would shift `>> 2` to // 320 and either narrow as u8 to 64 (scalar `as u8`) or saturate to @@ -319,10 +372,47 @@ mod tests { // & 0x3FF = 0x100 → 0x100 >> 2 = 64 consistently across all paths. let alpha: std::vec::Vec = std::vec![0x0500, 0xFFFF, 0x03FF]; let mut rgba = std::vec![1u8; 12]; - copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba, 3); + copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba, 3); assert_eq!(rgba, std::vec![1, 1, 1, 64, 1, 1, 1, 0xFF, 1, 1, 1, 0xFF]); } + /// BE parity: byte-swapping the source α plane and toggling the `BE` + /// flag must yield byte-for-byte identical output. Locks down the + /// codex-flagged corruption where a BE host processing LE input + /// would otherwise emit a byte-reversed α slot. 
The synthesized + /// "BE-encoded" buffer is built by host-side `swap_bytes` on the LE + /// fixture; both `from_le` (LE flag) and `from_be` (BE flag with the + /// swapped buffer) recover the same logical u16 values, so the + /// outputs match on every host. + #[test] + fn copy_alpha_plane_u16_to_u8_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0x3FF, 0x1FF, 0x0500, 0xFFFF, 0x07FF, 0x0123]; + let alpha_be: std::vec::Vec = alpha_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 24]; + let mut rgba_be = std::vec![1u8; 24]; + copy_alpha_plane_u16_to_u8::<10, false>(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_u16_to_u8::<10, true>(&alpha_be, &mut rgba_be, 6); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for the u16-output variant. + #[test] + fn copy_alpha_plane_u16_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0xFFFF, 0x0500, 0x07FF, 0x0123, 0x3FF, 0x000]; + let alpha_be: std::vec::Vec = alpha_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 24]; + let mut rgba_be = std::vec![7u16; 24]; + copy_alpha_plane_u16::<10, false>(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_u16::<10, true>(&alpha_be, &mut rgba_be, 6); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + #[test] fn copy_alpha_ya_u8_extracts_alpha_from_odd_byte_slots() { // Ya8 packed layout: [Y0, A0, Y1, A1, Y2, A2] diff --git a/src/row/scalar/planar_gbr_high_bit.rs b/src/row/scalar/planar_gbr_high_bit.rs index b9c966df..9cdc8568 100644 --- a/src/row/scalar/planar_gbr_high_bit.rs +++ b/src/row/scalar/planar_gbr_high_bit.rs @@ -1,13 +1,15 @@ //! Scalar reference kernels for high-bit-depth planar GBR sources -//! (Tier 10b — `AV_PIX_FMT_GBRP{9,10,12,14,16}LE` / -//! `AV_PIX_FMT_GBRAP{10,12,14,16}LE`). +//! (Tier 10b — `AV_PIX_FMT_GBRP{9,10,12,14,16}LE/BE` / +//! 
`AV_PIX_FMT_GBRAP{10,12,14,16}LE/BE`). //! //! `gbr_*` kernels (3-plane, no α) are const-generic over -//! `BITS ∈ {9, 10, 12, 14, 16}`. `gbra_*` kernels (4-plane, with α) -//! are const-generic over `BITS ∈ {10, 12, 14, 16}` — FFmpeg has no -//! `GBRAP9` variant; only the 3-plane `GBRP9` exists at 9 bits. +//! `BITS ∈ {9, 10, 12, 14, 16}` **and** `BE: bool` (endianness of the +//! source planes). `gbra_*` kernels (4-plane, with α) are const-generic +//! over `BITS ∈ {10, 12, 14, 16}` — FFmpeg has no `GBRAP9` variant; +//! only the 3-plane `GBRP9` exists at 9 bits. //! No runtime branching on `BITS` — every `BITS - 8` shift is a -//! const-eval expression resolved at monomorphisation. +//! const-eval expression resolved at monomorphisation. The `BE` branch is +//! also const-folded away at monomorphisation time. //! //! # Output variants //! @@ -34,18 +36,27 @@ //! //! - u8: `0xFF` //! - u16: `(1u16 << BITS) - 1` (i.e., `511`, `1023`, `4095`, …) +//! +//! # Big-endian (`BE = true`) mode +//! +//! When `BE = true` each u16 sample is byte-swapped before masking and +//! arithmetic. The swap is a compile-time branch: the `BE = false` path +//! compiles to a no-op and the call overhead is zero. /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B` /// **bytes**, downshifting each sample by `BITS - 8`. /// /// Output order is **R, G, B** per pixel (FFmpeg `RGB24` convention). /// +/// When `BE = true` each source element is byte-swapped before processing +/// (big-endian wire format → host-native arithmetic value). +/// /// # Panics (debug builds) /// /// Asserts that `g`, `b`, `r` each have at least `width` samples and /// `rgb_out` has at least `width * 3` bytes. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgb_high_bit_row( +pub(crate) fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -65,9 +76,24 @@ pub(crate) fn gbr_to_rgb_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 3; rgb_out[dst] = (r_val >> shift) as u8; rgb_out[dst + 1] = (g_val >> shift) as u8; @@ -79,12 +105,14 @@ pub(crate) fn gbr_to_rgb_high_bit_row( /// **`u16`** samples. Copies samples directly without shifting — /// output values are in `[0, (1 << BITS) - 1]`. /// +/// When `BE = true` each source element is byte-swapped before processing. +/// /// # Panics (debug builds) /// /// Asserts that `g`, `b`, `r` each have at least `width` samples and /// `rgb_u16_out` has at least `width * 3` samples. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgb_u16_high_bit_row( +pub(crate) fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -103,9 +131,24 @@ pub(crate) fn gbr_to_rgb_u16_high_bit_row( debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); let mask: u16 = ((1u32 << BITS) - 1) as u16; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 3; rgb_u16_out[dst] = r_val; rgb_u16_out[dst + 1] = g_val; @@ -118,8 +161,9 @@ pub(crate) fn gbr_to_rgb_u16_high_bit_row( /// `Gbrp*` sources (no alpha plane) when `with_rgba` is requested. /// /// Each sample is downshifted by `BITS - 8`. +/// When `BE = true` each source element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgba_opaque_high_bit_row( +pub(crate) fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -139,9 +183,24 @@ pub(crate) fn gbr_to_rgba_opaque_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 4; rgba_out[dst] = (r_val >> shift) as u8; rgba_out[dst + 1] = (g_val >> shift) as u8; @@ -154,8 +213,9 @@ pub(crate) fn gbr_to_rgba_opaque_high_bit_row( /// **`u16`** samples with a constant **opaque** alpha /// (`(1u16 << BITS) - 1`). Used for `Gbrp*` sources (no alpha plane) /// when `with_rgba_u16` is requested. Copies samples directly. +/// When `BE = true` each source element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( +pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -178,9 +238,24 @@ pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let opaque: u16 = mask; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 4; rgba_u16_out[dst] = r_val; rgba_u16_out[dst + 1] = g_val; @@ -192,8 +267,9 @@ pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **bytes**. Alpha is sourced from the `a` plane (real per-pixel α). /// Each sample (including α) is downshifted by `BITS - 8`. +/// When `BE = true` each source element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbra_to_rgba_high_bit_row( +pub(crate) fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -215,10 +291,30 @@ pub(crate) fn gbra_to_rgba_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; - let a_val = a[x] & mask; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let a_raw = if BE { + u16::from_be(a[x]) + } else { + u16::from_le(a[x]) + }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; + let a_val = a_raw & mask; let dst = x * 4; rgba_out[dst] = (r_val >> shift) as u8; rgba_out[dst + 1] = (g_val >> shift) as u8; @@ -230,8 +326,9 @@ pub(crate) fn gbra_to_rgba_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **`u16`** samples. Alpha is sourced from the `a` plane at native /// depth (no shift). Copies all four channels directly. +/// When `BE = true` each source element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbra_to_rgba_u16_high_bit_row( +pub(crate) fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -255,10 +352,30 @@ pub(crate) fn gbra_to_rgba_u16_high_bit_row( ); let mask: u16 = ((1u32 << BITS) - 1) as u16; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; - let a_val = a[x] & mask; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let a_raw = if BE { + u16::from_be(a[x]) + } else { + u16::from_le(a[x]) + }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; + let a_val = a_raw & mask; let dst = x * 4; rgba_u16_out[dst] = r_val; rgba_u16_out[dst + 1] = g_val; @@ -280,8 +397,9 @@ pub(crate) fn gbra_to_rgba_u16_high_bit_row( /// `full_range = false` → Y' ∈ `[16 << (BITS - 8), 235 << (BITS - 8)]` /// (limited / studio swing). The limited-range formula mirrors /// `rgb_to_luma_row` but scaled to native depth. +/// When `BE = true` each source element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_luma_u16_high_bit_row( +pub(crate) fn gbr_to_luma_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -311,9 +429,24 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row( if full_range { for x in 0..width { - let rv = (r[x] & mask) as i64; - let gv = (g[x] & mask) as i64; - let bv = (b[x] & mask) as i64; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let rv = (r_raw & mask) as i64; + let gv = (g_raw & mask) as i64; + let bv = (b_raw & mask) as i64; let y = ((k_r * rv + k_g * gv + k_b * bv + RND) >> 15) as i32; luma_out[x] = y.clamp(0, native_max as i32) as u16; } @@ -339,9 +472,24 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row( let y_max = (235i64) << (BITS - 8); let y_min = y_off; for x in 0..width { - let rv = (r[x] & mask) as i64; - let gv = (g[x] & mask) as i64; - let bv = (b[x] & mask) as i64; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let rv = (r_raw & mask) as i64; + let gv = (g_raw & mask) as i64; + let bv = (b_raw & mask) as i64; let y_full = (k_r * rv + k_g * gv + k_b * bv + RND) >> 15; let y_full_clamped = y_full.clamp(0, native_max_i64); let y_lim = y_off + (y_full_clamped * range + native_max_i64 / 2) / native_max_i64; @@ -357,29 +505,51 @@ mod tests { use super::*; use crate::ColorMatrix; + // ---- LE-host fixture tests ---- + // + // The tests below use host-native `u16` literals (e.g. `[100u16; 1]`, + // `vec![400u16, 200u16, 0u16]`) as if they were the on-disk LE + // encoding of those samples and then call the kernel with + // `` (LE path). 
On a BE host (e.g., s390x under + // miri-sb), host-native `u16` storage does NOT lay bytes out + // little-endian, so the kernel's `u16::from_le` byte-swap correctly + // reinterprets the host-native value and produces a different + // logical value than the literal — making the assertion fail. The + // kernel is correct: its BE-host scalar correctness is locked down + // by the dedicated `scalar_*_be_parity_*` tests further below, which + // build BE-encoded fixtures via `byte_swap_vec` from LE inputs and + // assert byte-for-byte parity. Gating these LE-fixture tests on + // `target_endian = "little"` avoids fixture-vs-kernel byte-order + // confusion without weakening coverage. + // Tests with all-zero / all-`u16::MAX` (byte-symmetric) literals are + // intentionally NOT gated — `from_le` is a no-op on those bit + // patterns regardless of host endianness. + // ---- gbr_to_rgb_high_bit_row: u8 output, downshift ---------------------- #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits10_channel_reorder() { // G=0, B=100, R=1000 → packed R,G,B = 1000>>2, 0>>2, 100>>2 = 250, 0, 25 let g = [0u16; 1]; let b = [100u16; 1]; let r = [1000u16; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 250); // R assert_eq!(out[1], 0); // G assert_eq!(out[2], 25); // B } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits10_max_value_becomes_0xff() { let max = (1u16 << 10) - 1; // 1023 let g = [max; 4]; let b = [max; 4]; let r = [max; 4]; let mut out = [0u8; 12]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 4); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4); assert!(out.iter().all(|&v| v == 0xFF), "all pixels must be 0xFF"); } @@ -390,7 +560,7 @@ mod tests { let b = [max; 2]; let r = [max; 2]; let mut out = [0u8; 6]; - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_high_bit_row::<16, 
false>(&g, &b, &r, &mut out, 2); assert!(out.iter().all(|&v| v == 0xFF)); } @@ -400,33 +570,36 @@ mod tests { let b = [0u16; 2]; let r = [0u16; 2]; let mut out = [0xFFu8; 6]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2); assert!(out.iter().all(|&v| v == 0)); } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits9_downshift_by_1() { // BITS=9: shift = 1. Value 510 >> 1 = 255. let g = [510u16; 1]; let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<9>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[1], 255); // G channel } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits12_downshift_by_4() { // BITS=12: shift = 4. Value 4080 >> 4 = 255. let r = [4080u16; 1]; let g = [0u16; 1]; let b = [0u16; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<12>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 255); // R channel } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_multiple_pixels_correct_layout() { // 3 pixels: (R,G,B) = (100,200,300>>2=75), (200>>2=50,0,0), (0,150>>2=37,50>>2=12) // BITS=10, shift=2 @@ -434,7 +607,7 @@ mod tests { let g = [800u16, 0u16, 600u16]; let b = [300u16, 0u16, 200u16]; let mut out = [0u8; 9]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 3); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 3); // pixel 0: R=400>>2=100, G=800>>2=200, B=300>>2=75 assert_eq!(out[0], 100); assert_eq!(out[1], 200); @@ -452,25 +625,27 @@ mod tests { // ---- gbr_to_rgb_u16_high_bit_row: u16 output, no shift ------------------ #[test] + #[cfg(target_endian = "little")] fn rgb_u16_high_bit_channel_reorder() { let g = [111u16; 1]; let b = [222u16; 1]; let r = [333u16; 1]; let mut out = [0u16; 3]; - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + 
gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 333); // R assert_eq!(out[1], 111); // G assert_eq!(out[2], 222); // B } #[test] + #[cfg(target_endian = "little")] fn rgb_u16_high_bit_bits10_max_preserved() { let max = (1u16 << 10) - 1; // 1023 let g = [max; 4]; let b = [max; 4]; let r = [max; 4]; let mut out = [0u16; 12]; - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 4); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4); assert!(out.iter().all(|&v| v == max)); } @@ -481,18 +656,19 @@ mod tests { let b = [max; 2]; let r = [max; 2]; let mut out = [0u16; 6]; - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2); assert!(out.iter().all(|&v| v == max)); } #[test] + #[cfg(target_endian = "little")] fn rgb_u16_high_bit_values_not_shifted() { // Verify that u16 output does NOT shift values (unlike u8 output). let g = [1000u16; 1]; let b = [2000u16; 1]; let r = [3000u16; 1]; let mut out = [0u16; 3]; - gbr_to_rgb_u16_high_bit_row::<12>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_u16_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 3000); // R — unchanged assert_eq!(out[1], 1000); // G — unchanged assert_eq!(out[2], 2000); // B — unchanged @@ -501,13 +677,14 @@ mod tests { // ---- gbr_to_rgba_opaque_high_bit_row: u8 RGBA with constant alpha -------- #[test] + #[cfg(target_endian = "little")] fn rgba_opaque_high_bit_bits10_alpha_is_0xff() { let max = (1u16 << 10) - 1; let g = [max; 4]; let b = [max; 4]; let r = [max; 4]; let mut out = [0u8; 16]; - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out, 4); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4); for i in 0..4 { assert_eq!(out[i * 4 + 3], 0xFF, "alpha must be 0xFF at pixel {i}"); assert_eq!(out[i * 4], 0xFF, "R must be 0xFF at pixel {i}"); @@ -515,13 +692,14 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn 
rgba_opaque_high_bit_bits9_downshift_correct() { // BITS=9, shift=1. Value 510 >> 1 = 255. let g = [510u16; 1]; let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u8; 4]; - gbr_to_rgba_opaque_high_bit_row::<9>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[1], 255); // G assert_eq!(out[3], 0xFF); // alpha } @@ -529,12 +707,13 @@ mod tests { // ---- gbr_to_rgba_opaque_u16_high_bit_row: u16 RGBA with constant alpha --- #[test] + #[cfg(target_endian = "little")] fn rgba_opaque_u16_high_bit_bits10_alpha_is_1023() { let g = [500u16; 2]; let b = [200u16; 2]; let r = [800u16; 2]; let mut out = [0u16; 8]; - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 2); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2); let opaque = (1u16 << 10) - 1; // 1023 assert_eq!(out[3], opaque); // pixel 0 alpha assert_eq!(out[7], opaque); // pixel 1 alpha @@ -549,7 +728,7 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u16; 4]; - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[3], u16::MAX); } @@ -559,13 +738,14 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u16; 4]; - gbr_to_rgba_opaque_u16_high_bit_row::<9>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_u16_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[3], (1u16 << 9) - 1); // 511 } // ---- gbra_to_rgba_high_bit_row: u8 RGBA with source alpha ---------------- #[test] + #[cfg(target_endian = "little")] fn gbra_rgba_high_bit_bits10_source_alpha_downshifted() { // BITS=10, shift=2. Alpha value 512 >> 2 = 128. 
let g = [0u16; 1]; @@ -573,11 +753,12 @@ mod tests { let r = [0u16; 1]; let a = [512u16; 1]; let mut out = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[3], 128); // alpha = 512 >> 2 } #[test] + #[cfg(target_endian = "little")] fn gbra_rgba_high_bit_bits10_max_alpha_is_0xff() { let max = (1u16 << 10) - 1; let g = [max; 2]; @@ -585,13 +766,14 @@ mod tests { let r = [max; 2]; let a = [max; 2]; let mut out = [0u8; 8]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 2); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 2); for i in 0..2 { assert_eq!(out[i * 4 + 3], 0xFF, "alpha must be 0xFF at pixel {i}"); } } #[test] + #[cfg(target_endian = "little")] fn gbra_rgba_high_bit_bits14_channel_reorder_and_shift() { // BITS=14, shift=6. R=16320 >> 6 = 255, G=0, B=0, A=8192 >> 6 = 128. let g = [0u16; 1]; @@ -599,7 +781,7 @@ mod tests { let r = [16320u16; 1]; let a = [8192u16; 1]; let mut out = [0u8; 4]; - gbra_to_rgba_high_bit_row::<14>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_high_bit_row::<14, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], 255); // R assert_eq!(out[1], 0); // G assert_eq!(out[2], 0); // B @@ -609,13 +791,14 @@ mod tests { // ---- gbra_to_rgba_u16_high_bit_row: u16 RGBA with source alpha ----------- #[test] + #[cfg(target_endian = "little")] fn gbra_rgba_u16_high_bit_source_alpha_preserved() { let g = [100u16; 1]; let b = [200u16; 1]; let r = [300u16; 1]; let a = [777u16; 1]; let mut out = [0u16; 4]; - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], 300); // R assert_eq!(out[1], 100); // G assert_eq!(out[2], 200); // B @@ -623,13 +806,14 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn gbra_rgba_u16_high_bit_bits16_all_channels_preserved() { let g = [10000u16; 2]; let b = 
[20000u16; 2]; let r = [30000u16; 2]; let a = [40000u16; 2]; let mut out = [0u16; 8]; - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out, 2); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out, 2); for i in 0..2 { assert_eq!(out[i * 4], 30000); assert_eq!(out[i * 4 + 1], 10000); @@ -641,6 +825,7 @@ mod tests { // ---- Round-trip parity: high-bit u8 output matches 8-bit source ---------- #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits10_parity_with_scaled_8bit() { // val=128 in 8-bit; in 10-bit: 128 << 2 = 512. 512 >> 2 = 128. let val: u16 = 128u16 << 2; @@ -648,11 +833,12 @@ mod tests { let b = [val; 8]; let r = [val; 8]; let mut out = [0u8; 24]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 8); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 8); assert!(out.iter().all(|&v| v == 128)); } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits12_parity_with_scaled_8bit() { // val=200 in 8-bit; in 12-bit: 200 << 4 = 3200. 3200 >> 4 = 200. let val: u16 = 200u16 << 4; @@ -660,7 +846,7 @@ mod tests { let b = [val; 4]; let r = [val; 4]; let mut out = [0u8; 12]; - gbr_to_rgb_high_bit_row::<12>(&g, &b, &r, &mut out, 4); + gbr_to_rgb_high_bit_row::<12, false>(&g, &b, &r, &mut out, 4); assert!(out.iter().all(|&v| v == 200)); } @@ -669,6 +855,7 @@ mod tests { // correctly before processing, ensuring scalar/SIMD produce identical output. #[test] + #[cfg(target_endian = "little")] fn gbr_to_rgb_high_bit_masks_upper_bits_bits10() { // BITS=10, mask=0x03FF. Input 0x0CFF has upper bits set. // masked = 0x0CFF & 0x03FF = 0x00FF = 255. 255 >> 2 = 63 as u8. 
@@ -679,7 +866,7 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!( out[0], expected_u8, "R must equal masked-then-shifted value" @@ -695,6 +882,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn gbr_to_rgb_high_bit_masks_upper_bits_multiple_widths_bits10() { // Width sweep: [1, 7, 8, 16, 17, 32, 33, 64, 128, 130]. let dirty: u16 = 0x0500; // BITS=10: mask&0x0500 = 0x0100=256; 256>>2=64. @@ -705,7 +893,7 @@ mod tests { let b = std::vec![dirty; w]; let r = std::vec![dirty; w]; let mut out = std::vec![0u8; w * 3]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, w); for i in 0..w { assert_eq!(out[i * 3], expected_u8, "R pixel {i} wrong at width {w}"); assert_eq!( @@ -723,6 +911,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn gbra_to_rgba_high_bit_masks_upper_bits_alpha_bits10() { // Verify that the alpha channel is also masked before shifting. // BITS=10: dirty_alpha = 0x0800 | 512 = 0x0A00 = 2560. @@ -734,7 +923,7 @@ mod tests { let r = [dirty_rgb; 1]; let a = [dirty_alpha; 1]; let mut out = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], 0, "R (dirty, masked to 0)"); assert_eq!(out[1], 0, "G (dirty, masked to 0)"); assert_eq!(out[2], 0, "B (dirty, masked to 0)"); @@ -742,6 +931,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn gbr_to_rgb_u16_high_bit_masks_upper_bits_bits10() { // u16-output: verify that masked sample is in the output (not raw dirty value). 
let dirty: u16 = 0x0CFF; @@ -750,13 +940,14 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u16; 3]; - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], clean, "R u16 must be masked value"); assert_eq!(out[1], clean, "G u16 must be masked value"); assert_eq!(out[2], clean, "B u16 must be masked value"); } #[test] + #[cfg(target_endian = "little")] fn gbra_to_rgba_u16_high_bit_masks_upper_bits_bits10() { // u16 RGBA output: all channels masked. let dirty: u16 = 0x0555; // BITS=10: masked = 0x0555 & 0x03FF = 0x0155 = 341. @@ -766,7 +957,7 @@ mod tests { let r = [dirty; 1]; let a = [dirty; 1]; let mut out = [0u16; 4]; - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], clean, "R u16 must be masked"); assert_eq!(out[1], clean, "G u16 must be masked"); assert_eq!(out[2], clean, "B u16 must be masked"); @@ -774,6 +965,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn gbr_to_rgba_opaque_high_bit_masks_upper_bits_bits10() { // u8 RGBA opaque: RGB channels masked, alpha always 0xFF. let dirty: u16 = 0x0CFF; // masked & 0x03FF = 0x00FF = 255. 255>>2=63. @@ -783,7 +975,7 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u8; 4]; - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], expected_u8, "R must be masked"); assert_eq!(out[1], expected_u8, "G must be masked"); assert_eq!(out[2], expected_u8, "B must be masked"); @@ -791,6 +983,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn gbr_to_rgba_opaque_u16_high_bit_masks_upper_bits_bits10() { // u16 RGBA opaque: RGB masked, alpha is opaque mask value. let dirty: u16 = 0x0CFF; // masked = 0x00FF = 255. 
@@ -799,7 +992,7 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u16; 4]; - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], clean, "R u16 must be masked"); assert_eq!(out[1], clean, "G u16 must be masked"); assert_eq!(out[2], clean, "B u16 must be masked"); @@ -815,7 +1008,7 @@ mod tests { let b = [val; 2]; let r = [val; 2]; let mut out = [0u8; 6]; - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2); assert!( out.iter().all(|&v| v == 0xFF), "BITS=16: max sample => 0xFF" @@ -825,6 +1018,7 @@ mod tests { // ---- Cross-path consistency: direct GBRA vs masked RGB + separate alpha --- #[test] + #[cfg(target_endian = "little")] fn gbra_to_rgba_high_bit_cross_path_consistency_bits10() { // With upper-bits-set alpha: direct gbra_to_rgba == manual masking. // BITS=10, dirty_alpha = 0x0800 | 0x0100 = 0x0900; masked=0x0100=256; 256>>2=64. @@ -839,12 +1033,12 @@ mod tests { // Direct path let mut out_direct = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_direct, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_direct, 1); // Manual path: apply mask to alpha, call with clean value let a_clean = [clean_alpha; 1]; let mut out_manual = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a_clean, &mut out_manual, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a_clean, &mut out_manual, 1); assert_eq!( out_direct, out_manual, @@ -856,6 +1050,7 @@ mod tests { // ---- gbr_to_luma_u16_high_bit_row: native-depth luma -------------------- #[test] + #[cfg(target_endian = "little")] fn luma_u16_high_bit_bits10_max_white_not_banded() { // BITS=10: max = 1023. Old path gave (255 as u16) << 2 = 1020, not 1023. // New kernel must produce a value near 1023 for all-white input. 
@@ -864,7 +1059,7 @@ mod tests { let b = [max; 1]; let r = [max; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); // For BT.709 full-range all-white: Y = round(Kr*max + Kg*max + Kb*max). // = round((6966 + 23436 + 2366) / 32768 * 1023) ≈ round(32768/32768 * 1023) = 1023. assert!( @@ -879,6 +1074,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn luma_u16_high_bit_bits12_max_white_not_banded() { // BITS=12: max = 4095. Old path: (255 as u16) << 4 = 4080. // New kernel should give a value in [4090, 4095]. @@ -887,7 +1083,7 @@ mod tests { let b = [max; 1]; let r = [max; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<12>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt601, true); + gbr_to_luma_u16_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt601, true); assert!( out[0] >= 4090, "max-white luma_u16 bits12 must be near 4095 (was {})", @@ -905,7 +1101,7 @@ mod tests { let b = [max; 1]; let r = [max; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert!( out[0] >= 65520, "max-white luma_u16 bits16 must be near 65535 (was {}), old banded gives 65280", @@ -915,6 +1111,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn luma_u16_high_bit_bits10_neutral_gray_midrange() { // BITS=10: mid = 512. Luma of neutral gray ≈ 512. 
let mid = 512u16; @@ -922,7 +1119,7 @@ mod tests { let b = [mid; 1]; let r = [mid; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert!( out[0] >= 510 && out[0] <= 514, "neutral gray luma_u16 must be ~512 (was {})", @@ -936,11 +1133,12 @@ mod tests { let b = [0u16; 2]; let r = [0u16; 2]; let mut out = [0xFFFFu16; 2]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 2, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2, ColorMatrix::Bt709, true); assert!(out.iter().all(|&v| v == 0), "all-black must give zero luma"); } #[test] + #[cfg(target_endian = "little")] fn luma_u16_high_bit_bits10_full_range_vs_limited_range() { // For mid-gray input, limited-range luma should be in [16<<2, 235<<2] = [64, 940]. let mid = 512u16; @@ -949,8 +1147,24 @@ mod tests { let r = [mid; 1]; let mut out_full = [0u16; 1]; let mut out_lim = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out_full, 1, ColorMatrix::Bt601, true); - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out_lim, 1, ColorMatrix::Bt601, false); + gbr_to_luma_u16_high_bit_row::<10, false>( + &g, + &b, + &r, + &mut out_full, + 1, + ColorMatrix::Bt601, + true, + ); + gbr_to_luma_u16_high_bit_row::<10, false>( + &g, + &b, + &r, + &mut out_lim, + 1, + ColorMatrix::Bt601, + false, + ); let y_off = 16u16 << 2; // 64 let y_max = 235u16 << 2; // 940 assert!( @@ -976,7 +1190,7 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); let y_off = 16u16 << 8; // 4096 assert_eq!( out[0], y_off, @@ -1001,7 +1215,7 @@ mod tests { let b = [u16::MAX; 1]; let r = [u16::MAX; 1]; let mut out = 
[0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); let y_max = 235u16 << 8; // 60160 assert_eq!( out[0], y_max, @@ -1010,6 +1224,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn luma_u16_high_bit_bits16_limited_range_near_white_keeps_gradation() { // BITS=16, BT.709 luma weights ≈ Kr=0.2126, Kg=0.7152, Kb=0.0722. // Setting all 3 channels equal makes the matrix multiply produce @@ -1022,7 +1237,7 @@ mod tests { let b = [v; 1]; let r = [v; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); // Native-depth limited-range: y_lim = 4096 + v × 56064 / 65535 let expected = 4096 + ((v as u64 * 56064 + 65535 / 2) / 65535) as u16; // Allow ±1 LSB for matrix-multiply rounding (BT.709 weights aren't @@ -1044,6 +1259,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn luma_u16_high_bit_bits10_limited_range_endpoints() { // BITS=10: y_off=64 (=16<<2), y_max=940 (=235<<2), native_max=1023. // BT.709 luma at all-equal channels passes y_full ≈ input through. @@ -1054,7 +1270,7 @@ mod tests { let b = [input; 1]; let r = [input; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); let diff = (out[0] as i32 - expected as i32).abs(); assert!( diff <= 1, @@ -1063,4 +1279,164 @@ mod tests { ); } } + + // ---- BE vs LE parity: scalar must produce same output as ------- + // scalar on byte-swapped input. Covers 6 kernels at BITS 10/16. 
- + + fn byte_swap_vec(v: &[u16]) -> std::vec::Vec { + v.iter().map(|x| x.swap_bytes()).collect() + } + + fn rand_plane(seed: u32, n: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let mut s = seed; + (0..n) + .map(|_| { + s = s.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + (s & mask) as u16 + }) + .collect() + } + + #[test] + fn scalar_gbr_to_rgb_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17, 33, 65] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgb bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbr_to_rgb_high_bit_be_parity_bits16() { + for w in [1usize, 7, 8, 9, 17, 33, 65] { + let g = rand_plane::<16>(0xAAAA, w); + let b = rand_plane::<16>(0xBBBB, w); + let r = rand_plane::<16>(0xCCCC, w); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgb bits16 w={w}" + ); + } + } + + #[test] + fn scalar_gbr_to_rgba_opaque_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>( + &byte_swap_vec(&g), + 
&byte_swap_vec(&b), + &byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgba_opaque bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbra_to_rgba_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let a = rand_plane::<10>(0xDDDD, w); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &byte_swap_vec(&a), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbra_to_rgba bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbr_to_rgb_u16_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgb_u16 bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbra_to_rgba_u16_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let a = rand_plane::<10>(0xDDDD, w); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + 
&byte_swap_vec(&a), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbra_to_rgba_u16 bits10 w={w}" + ); + } + } } diff --git a/src/sinker/mixed/planar_gbr_high_bit.rs b/src/sinker/mixed/planar_gbr_high_bit.rs index f28432b9..f1a6479c 100644 --- a/src/sinker/mixed/planar_gbr_high_bit.rs +++ b/src/sinker/mixed/planar_gbr_high_bit.rs @@ -237,7 +237,14 @@ macro_rules! impl_gbrp_high_bit { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; - gbr_to_rgba_opaque_u16_high_bit_row::(g_in, b_in, r_in, rgba_u16_row, w, use_simd); + gbr_to_rgba_opaque_u16_high_bit_row::( + g_in, + b_in, + r_in, + rgba_u16_row, + w, + use_simd, + ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = @@ -250,7 +257,7 @@ macro_rules! impl_gbrp_high_bit { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); + gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); if want_rgba_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = @@ -264,7 +271,7 @@ macro_rules! impl_gbrp_high_bit { // going through the u8 staging path, so it is independent of whether // RGB staging happens below. if let Some(luma_u16_buf) = luma_u16.as_deref_mut() { - gbr_to_luma_u16_high_bit_row::( + gbr_to_luma_u16_high_bit_row::( g_in, b_in, r_in, @@ -287,7 +294,7 @@ macro_rules! impl_gbrp_high_bit { if want_rgba && !need_rgb_staging { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbr_to_rgba_opaque_high_bit_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbr_to_rgba_opaque_high_bit_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -304,7 +311,7 @@ macro_rules! 
impl_gbrp_high_bit { w, h, )?; - gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -519,7 +526,15 @@ macro_rules! impl_gbrap_high_bit { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; - gbra_to_rgba_u16_high_bit_row::(g_in, b_in, r_in, a_in, rgba_u16_row, w, use_simd); + gbra_to_rgba_u16_high_bit_row::( + g_in, + b_in, + r_in, + a_in, + rgba_u16_row, + w, + use_simd, + ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = @@ -532,7 +547,7 @@ macro_rules! impl_gbrap_high_bit { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); + gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); if want_rgba_u16 { // Strategy A+: expand RGB → RGBA, then overwrite α from source plane. let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); @@ -540,7 +555,11 @@ macro_rules! impl_gbrap_high_bit { rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); // Overwrite α slot from source plane (native depth, no shift). - alpha_extract::copy_alpha_plane_u16::(a_in, rgba_u16_row, w, use_simd); + // BE flag hard-wired to `false`: this sinker only handles LE-encoded + // GBR/GBRA inputs today (Tier 10b). Phase 4 will wire the kernel's + // `` through here (matches the LE-only `false` in + // the sibling `gbr_to_rgb_u16_high_bit_row::` call). + alpha_extract::copy_alpha_plane_u16::(a_in, rgba_u16_row, w, use_simd); } } @@ -549,7 +568,7 @@ macro_rules! 
impl_gbrap_high_bit { // going through the u8 staging path, so it is independent of whether // RGB staging happens below. if let Some(luma_u16_buf) = luma_u16.as_deref_mut() { - gbr_to_luma_u16_high_bit_row::( + gbr_to_luma_u16_high_bit_row::( g_in, b_in, r_in, @@ -572,7 +591,7 @@ macro_rules! impl_gbrap_high_bit { if want_rgba && !need_rgb_staging { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbra_to_rgba_high_bit_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbra_to_rgba_high_bit_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); return Ok(()); } @@ -589,7 +608,7 @@ macro_rules! impl_gbrap_high_bit { w, h, )?; - gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -618,7 +637,8 @@ macro_rules! impl_gbrap_high_bit { // overwrite α bytes from the source A plane. let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - alpha_extract::copy_alpha_plane_u16_to_u8::(a_in, rgba_row, w, use_simd); + // BE flag hard-wired to `false`: see the rgba_u16 branch above. + alpha_extract::copy_alpha_plane_u16_to_u8::(a_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/tests/planar_gbr_high_bit.rs b/src/sinker/mixed/tests/planar_gbr_high_bit.rs index 83cf13c4..d49e573e 100644 --- a/src/sinker/mixed/tests/planar_gbr_high_bit.rs +++ b/src/sinker/mixed/tests/planar_gbr_high_bit.rs @@ -140,10 +140,13 @@ test_gbrp_channel_reorder!(gbrp16_channel_reorder, Gbrp16, gbrp16_to, 16); macro_rules! 
test_gbrap_strategy_a_plus { ($name:ident, $marker:ident, $walker:ident, $bits:literal) => { + test_gbrap_strategy_a_plus!($name, $marker, $walker, $bits, 32); + }; + ($name:ident, $marker:ident, $walker:ident, $bits:literal, $w:literal) => { #[test] #[cfg_attr(miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri")] fn $name() { - let w = 32usize; + let w = $w as usize; let h = 8usize; let n = w * h; let mut g = std::vec![0u16; n]; @@ -177,7 +180,7 @@ macro_rules! test_gbrap_strategy_a_plus { // RGBA bytes must be identical between standalone and combo paths. assert_eq!( rgba_ref, rgba_combo, - "Strategy A+ RGBA mismatch for BITS={}", $bits, + "Strategy A+ RGBA mismatch for BITS={} w={}", $bits, $w, ); } }; @@ -208,6 +211,151 @@ test_gbrap_strategy_a_plus!( 16 ); +// ---- Strategy A+: Gbrap combo RGB_u16+RGBA_u16 matches standalone RGBA_u16 - +// +// Mirrors the u8 Strategy A+ test above, but covers the native-depth combo +// path (`with_rgb_u16` + `with_rgba_u16`) that routes through +// `copy_alpha_plane_u16` rather than `copy_alpha_plane_u16_to_u8`. Without +// this, a regression in the `BE != cfg!(target_endian)` dispatcher routing +// or in the scalar α-extract helper would not be caught for the native-depth +// path. +// +// Source planes are filled with full-range u16 values (`bits=16` argument +// to `pseudo_random_u16_low_n_bits`) so the upper bits beyond BITS are +// "dirty" — both paths must mask via `(1 << BITS) - 1`, so any drift between +// them surfaces here. +macro_rules! 
test_gbrap_strategy_a_plus_u16 { + ($name:ident, $marker:ident, $walker:ident, $bits:literal) => { + test_gbrap_strategy_a_plus_u16!($name, $marker, $walker, $bits, 32); + }; + ($name:ident, $marker:ident, $walker:ident, $bits:literal, $w:literal) => { + #[test] + #[cfg_attr(miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri")] + fn $name() { + let w = $w as usize; + let h = 8usize; + let n = w * h; + let mut g = std::vec![0u16; n]; + let mut b = std::vec![0u16; n]; + let mut r = std::vec![0u16; n]; + let mut a = std::vec![0u16; n]; + // Use full-range u16 (bits=16) so upper bits beyond BITS are dirty, + // exercising the mask in both the direct kernel and α-extract paths. + pseudo_random_u16_low_n_bits(&mut g, 0x55_u32.wrapping_add($bits), 16); + pseudo_random_u16_low_n_bits(&mut b, 0x66_u32.wrapping_add($bits), 16); + pseudo_random_u16_low_n_bits(&mut r, 0x77_u32.wrapping_add($bits), 16); + pseudo_random_u16_low_n_bits(&mut a, 0x88_u32.wrapping_add($bits), 16); + + // Reference: standalone with_rgba_u16 (direct 4-channel kernel). + let src_ref = solid_gbrap_frame::<$bits>(&g, &b, &r, &a, w as u32, h as u32); + let mut rgba_u16_ref = std::vec![0u16; n * 4]; + let mut sink_ref = MixedSinker::::new(w, h) + .with_rgba_u16(&mut rgba_u16_ref) + .unwrap(); + crate::yuv::$walker(&src_ref, false, ColorMatrix::Bt709, &mut sink_ref).unwrap(); + + // Combo: with_rgb_u16 + with_rgba_u16 (Strategy A+ native-depth). + let src_combo = solid_gbrap_frame::<$bits>(&g, &b, &r, &a, w as u32, h as u32); + let mut rgb_u16_combo = std::vec![0u16; n * 3]; + let mut rgba_u16_combo = std::vec![0u16; n * 4]; + let mut sink_combo = MixedSinker::::new(w, h) + .with_rgb_u16(&mut rgb_u16_combo) + .unwrap() + .with_rgba_u16(&mut rgba_u16_combo) + .unwrap(); + crate::yuv::$walker(&src_combo, false, ColorMatrix::Bt709, &mut sink_combo).unwrap(); + + // RGBA u16 elements must be byte-exact between standalone and combo paths. 
+ assert_eq!( + rgba_u16_ref, rgba_u16_combo, + "Strategy A+ native-depth RGBA u16 mismatch for BITS={} w={}", $bits, $w, + ); + } + }; +} + +test_gbrap_strategy_a_plus_u16!( + gbrap10_strategy_a_plus_u16_matches_standalone, + Gbrap10, + gbrap10_to, + 10 +); +test_gbrap_strategy_a_plus_u16!( + gbrap12_strategy_a_plus_u16_matches_standalone, + Gbrap12, + gbrap12_to, + 12 +); +test_gbrap_strategy_a_plus_u16!( + gbrap14_strategy_a_plus_u16_matches_standalone, + Gbrap14, + gbrap14_to, + 14 +); +test_gbrap_strategy_a_plus_u16!( + gbrap16_strategy_a_plus_u16_matches_standalone, + Gbrap16, + gbrap16_to, + 16 +); + +// ---- Strategy A+ at non-multiple width (31) — exercises SIMD scalar tail --- +// +// The SIMD α-extract backends (`copy_alpha_plane_u16{_to_u8}`) hardcode +// `scalar::` for the tail (e.g. NEON block size 8 + width 31 +// leaves 7 px in the tail; AVX2/AVX-512 likewise). Codex's 4th-pass review +// of PR #82 found that the prior dispatcher routing +// (`need_swap = BE != cfg!(target_endian = "big")`) admitted SIMD on +// BE-host/BE-data: the vector body's host-native loads are correct there, +// but the LE-only scalar tail then byte-swaps already-native u16 samples, +// silently corrupting α at non-multiple widths. The fix is to route SIMD +// only for the LE-host/LE-data quadrant; these tests at width 31 exercise +// the SIMD tail path on supported (LE) hosts, locking in the parity +// guarantee for the LE/LE quadrant. (The LE/BE, BE/LE, BE/BE quadrants +// are exercised at the scalar level by the `target_endian`-aware scalar +// helper itself; the new dispatcher routes them to scalar always.) 
+ +test_gbrap_strategy_a_plus_u16!( + gbrap10_strategy_a_plus_u16_matches_standalone_w31, + Gbrap10, + gbrap10_to, + 10, + 31 +); +test_gbrap_strategy_a_plus_u16!( + gbrap12_strategy_a_plus_u16_matches_standalone_w31, + Gbrap12, + gbrap12_to, + 12, + 31 +); +test_gbrap_strategy_a_plus_u16!( + gbrap14_strategy_a_plus_u16_matches_standalone_w31, + Gbrap14, + gbrap14_to, + 14, + 31 +); +test_gbrap_strategy_a_plus_u16!( + gbrap16_strategy_a_plus_u16_matches_standalone_w31, + Gbrap16, + gbrap16_to, + 16, + 31 +); + +// u8-path Strategy A+ at width 31 — exercises the SIMD tail of +// `copy_alpha_plane_u16_to_u8` (depth-conv `>> (BITS - 8)`). One BITS value +// is sufficient to cover the same dispatcher path as the u16 set above; +// Gbrap10 chosen for parity with the existing u8 Strategy A+ coverage. +test_gbrap_strategy_a_plus!( + gbrap10_strategy_a_plus_matches_standalone_w31, + Gbrap10, + gbrap10_to, + 10, + 31 +); + // ---- Gbrap alpha downshift correctness ------------------------------------- macro_rules! test_gbrap_alpha_downshift { diff --git a/src/sinker/mixed/yuva_4_2_0.rs b/src/sinker/mixed/yuva_4_2_0.rs index e32af5ba..d543f0a6 100644 --- a/src/sinker/mixed/yuva_4_2_0.rs +++ b/src/sinker/mixed/yuva_4_2_0.rs @@ -657,7 +657,14 @@ fn yuva420p_high_bit_process< let rgba_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16::(a_row, rgba_u16_row, w, use_simd); + // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs + // today. Phase 4 will plumb a `` from the row type here. + crate::row::alpha_extract::copy_alpha_plane_u16::( + a_row, + rgba_u16_row, + w, + use_simd, + ); } } else if want_rgba_u16 { // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher. 
@@ -727,7 +734,10 @@ fn yuva420p_high_bit_process< let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::(a_row, rgba_row, w, use_simd); + // BE = false: see the rgba_u16 branch above for rationale. + crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::( + a_row, rgba_row, w, use_simd, + ); } Ok(()) diff --git a/src/sinker/mixed/yuva_4_2_2.rs b/src/sinker/mixed/yuva_4_2_2.rs index 6174c7d7..c7e861a6 100644 --- a/src/sinker/mixed/yuva_4_2_2.rs +++ b/src/sinker/mixed/yuva_4_2_2.rs @@ -757,7 +757,14 @@ fn yuva422p_high_bit_process< let rgba_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16::(a_row, rgba_u16_row, w, use_simd); + // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs + // today. Phase 4 will plumb a `` from the row type here. + crate::row::alpha_extract::copy_alpha_plane_u16::( + a_row, + rgba_u16_row, + w, + use_simd, + ); } } else if want_rgba_u16 { // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher. @@ -826,7 +833,10 @@ fn yuva422p_high_bit_process< let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::(a_row, rgba_row, w, use_simd); + // BE = false: see the rgba_u16 branch above for rationale. 
+ crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::( + a_row, rgba_row, w, use_simd, + ); } Ok(()) diff --git a/src/sinker/mixed/yuva_4_4_4.rs b/src/sinker/mixed/yuva_4_4_4.rs index be76e51a..c9d46e9d 100644 --- a/src/sinker/mixed/yuva_4_4_4.rs +++ b/src/sinker/mixed/yuva_4_4_4.rs @@ -868,7 +868,14 @@ fn yuva444p_high_bit_process< let rgba_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16::(a_row, rgba_u16_row, w, use_simd); + // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs + // today. Phase 4 will plumb a `` from the row type here. + crate::row::alpha_extract::copy_alpha_plane_u16::( + a_row, + rgba_u16_row, + w, + use_simd, + ); } } else if want_rgba_u16 { // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher. @@ -938,7 +945,10 @@ fn yuva444p_high_bit_process< let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::(a_row, rgba_row, w, use_simd); + // BE = false: see the rgba_u16 branch above for rationale. + crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::( + a_row, rgba_row, w, use_simd, + ); } Ok(())